Packages

Install and load required packages, if needed.

if(!require('tidyverse')){install.packages('tidyverse')} # For factor collapsing
library(tidyverse)
if(!require('ggplot2')){install.packages('ggplot2')} # For visualizations
library(ggplot2)
if(!require('gridExtra')){install.packages('gridExtra')} # For visualizations
library(gridExtra)
if(!require('corrplot')){install.packages('corrplot')} # For visualizations
library(corrplot)
if(!require('Hmisc')){install.packages('Hmisc')} # For rcorr function
library(Hmisc)
if(!require('ggcorrplot')){install.packages('ggcorrplot')} # For correlation heatmap
library(ggcorrplot)
if(!require('dplyr')){install.packages('dplyr')} # For grouping
library(dplyr)
if(!require('mltools')){install.packages('mltools')} # To create dummy variables
library(mltools)
if(!require('data.table')){install.packages('data.table')} # To create dummy variables
library(data.table)

Initial Data Cleaning

First import the raw data, and save it to the object db.raw

db.raw <- read.csv('/Users/amyhowe/Desktop/diabetic_data.csv')

Let’s take a look at the structure of the dataset, and identify if there are any duplicated rows or “NA” values.

str(db.raw) # There are 50 variables and 101766 observations
## 'data.frame':    101766 obs. of  50 variables:
##  $ encounter_id            : int  2278392 149190 64410 500364 16680 35754 55842 63768 12522 15738 ...
##  $ patient_nbr             : int  8222157 55629189 86047875 82442376 42519267 82637451 84259809 114882984 48330783 63555939 ...
##  $ race                    : Factor w/ 6 levels "?","AfricanAmerican",..: 4 4 2 4 4 4 4 4 4 4 ...
##  $ gender                  : Factor w/ 3 levels "Female","Male",..: 1 1 1 2 2 2 2 2 1 1 ...
##  $ age                     : Factor w/ 10 levels "[0-10)","[10-20)",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ weight                  : Factor w/ 10 levels "?","[0-25)","[100-125)",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ admission_type_id       : int  6 1 1 1 1 2 3 1 2 3 ...
##  $ discharge_disposition_id: int  25 1 1 1 1 1 1 1 1 3 ...
##  $ admission_source_id     : int  1 7 7 7 7 2 2 7 4 4 ...
##  $ time_in_hospital        : int  1 3 2 2 1 3 4 5 13 12 ...
##  $ payer_code              : Factor w/ 18 levels "?","BC","CH",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ medical_specialty       : Factor w/ 73 levels "?","AllergyandImmunology",..: 39 1 1 1 1 1 1 1 1 20 ...
##  $ num_lab_procedures      : int  41 59 11 44 51 31 70 73 68 33 ...
##  $ num_procedures          : int  0 0 5 1 0 6 1 0 2 3 ...
##  $ num_medications         : int  1 18 13 16 8 16 21 12 28 18 ...
##  $ number_outpatient       : int  0 0 2 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ diag_1                  : Factor w/ 717 levels "?","10","11",..: 126 145 456 556 56 265 265 278 254 284 ...
##  $ diag_2                  : Factor w/ 749 levels "?","11","110",..: 1 81 80 99 26 248 248 316 262 48 ...
##  $ diag_3                  : Factor w/ 790 levels "?","11","110",..: 1 123 768 250 88 88 772 88 231 319 ...
##  $ number_diagnoses        : int  1 9 6 7 5 9 7 8 8 8 ...
##  $ max_glu_serum           : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ A1Cresult               : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ metformin               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
##  $ repaglinide             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ nateglinide             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ chlorpropamide          : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ glimepiride             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
##  $ acetohexamide           : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ glipizide               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 3 2 3 2 2 2 3 2 ...
##  $ glyburide               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 3 2 2 ...
##  $ tolbutamide             : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ pioglitazone            : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ rosiglitazone           : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 3 ...
##  $ acarbose                : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ miglitol                : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ troglitazone            : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ tolazamide              : Factor w/ 3 levels "No","Steady",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ examide                 : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
##  $ citoglipton             : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
##  $ insulin                 : Factor w/ 4 levels "Down","No","Steady",..: 2 4 2 4 3 3 3 2 3 3 ...
##  $ glyburide.metformin     : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ glipizide.metformin     : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ glimepiride.pioglitazone: Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ metformin.rosiglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ metformin.pioglitazone  : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ change                  : Factor w/ 2 levels "Ch","No": 2 1 2 1 1 2 1 2 1 1 ...
##  $ diabetesMed             : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
##  $ readmitted              : Factor w/ 3 levels "<30",">30","NO": 3 2 3 3 3 2 3 2 3 3 ...
length(subset(duplicated(db.raw), duplicated(db.raw) == TRUE)) # The output is 0, indicating there are no fully duplicated rows
## [1] 0
sum(is.na(db.raw)) # There are no "NA" values in entire dataset
## [1] 0

Convert Attributes to Correct Data Types

Now we will convert attributes to the correct data types, and add in any missing level names.

db.init <- db.raw # Put the dataset in a new object for initial analysis and cleaning
sapply(db.init[,c(3:6,11:12,23:50)], levels) # The levels of all factor attributes except for the diagnoses were checked here for overlap - none present
## $race
## [1] "?"               "AfricanAmerican" "Asian"           "Caucasian"      
## [5] "Hispanic"        "Other"          
## 
## $gender
## [1] "Female"          "Male"            "Unknown/Invalid"
## 
## $age
##  [1] "[0-10)"   "[10-20)"  "[20-30)"  "[30-40)"  "[40-50)"  "[50-60)" 
##  [7] "[60-70)"  "[70-80)"  "[80-90)"  "[90-100)"
## 
## $weight
##  [1] "?"         "[0-25)"    "[100-125)" "[125-150)" "[150-175)" "[175-200)"
##  [7] "[25-50)"   "[50-75)"   "[75-100)"  ">200"     
## 
## $payer_code
##  [1] "?"  "BC" "CH" "CM" "CP" "DM" "FR" "HM" "MC" "MD" "MP" "OG" "OT" "PO" "SI"
## [16] "SP" "UN" "WC"
## 
## $medical_specialty
##  [1] "?"                                   
##  [2] "AllergyandImmunology"                
##  [3] "Anesthesiology"                      
##  [4] "Anesthesiology-Pediatric"            
##  [5] "Cardiology"                          
##  [6] "Cardiology-Pediatric"                
##  [7] "DCPTEAM"                             
##  [8] "Dentistry"                           
##  [9] "Dermatology"                         
## [10] "Emergency/Trauma"                    
## [11] "Endocrinology"                       
## [12] "Endocrinology-Metabolism"            
## [13] "Family/GeneralPractice"              
## [14] "Gastroenterology"                    
## [15] "Gynecology"                          
## [16] "Hematology"                          
## [17] "Hematology/Oncology"                 
## [18] "Hospitalist"                         
## [19] "InfectiousDiseases"                  
## [20] "InternalMedicine"                    
## [21] "Nephrology"                          
## [22] "Neurology"                           
## [23] "Neurophysiology"                     
## [24] "Obsterics&Gynecology-GynecologicOnco"
## [25] "Obstetrics"                          
## [26] "ObstetricsandGynecology"             
## [27] "Oncology"                            
## [28] "Ophthalmology"                       
## [29] "Orthopedics"                         
## [30] "Orthopedics-Reconstructive"          
## [31] "Osteopath"                           
## [32] "Otolaryngology"                      
## [33] "OutreachServices"                    
## [34] "Pathology"                           
## [35] "Pediatrics"                          
## [36] "Pediatrics-AllergyandImmunology"     
## [37] "Pediatrics-CriticalCare"             
## [38] "Pediatrics-EmergencyMedicine"        
## [39] "Pediatrics-Endocrinology"            
## [40] "Pediatrics-Hematology-Oncology"      
## [41] "Pediatrics-InfectiousDiseases"       
## [42] "Pediatrics-Neurology"                
## [43] "Pediatrics-Pulmonology"              
## [44] "Perinatology"                        
## [45] "PhysicalMedicineandRehabilitation"   
## [46] "PhysicianNotFound"                   
## [47] "Podiatry"                            
## [48] "Proctology"                          
## [49] "Psychiatry"                          
## [50] "Psychiatry-Addictive"                
## [51] "Psychiatry-Child/Adolescent"         
## [52] "Psychology"                          
## [53] "Pulmonology"                         
## [54] "Radiologist"                         
## [55] "Radiology"                           
## [56] "Resident"                            
## [57] "Rheumatology"                        
## [58] "Speech"                              
## [59] "SportsMedicine"                      
## [60] "Surgeon"                             
## [61] "Surgery-Cardiovascular"              
## [62] "Surgery-Cardiovascular/Thoracic"     
## [63] "Surgery-Colon&Rectal"                
## [64] "Surgery-General"                     
## [65] "Surgery-Maxillofacial"               
## [66] "Surgery-Neuro"                       
## [67] "Surgery-Pediatric"                   
## [68] "Surgery-Plastic"                     
## [69] "Surgery-PlasticwithinHeadandNeck"    
## [70] "Surgery-Thoracic"                    
## [71] "Surgery-Vascular"                    
## [72] "SurgicalSpecialty"                   
## [73] "Urology"                             
## 
## $max_glu_serum
## [1] ">200" ">300" "None" "Norm"
## 
## $A1Cresult
## [1] ">7"   ">8"   "None" "Norm"
## 
## $metformin
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $repaglinide
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $nateglinide
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $chlorpropamide
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $glimepiride
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $acetohexamide
## [1] "No"     "Steady"
## 
## $glipizide
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $glyburide
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $tolbutamide
## [1] "No"     "Steady"
## 
## $pioglitazone
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $rosiglitazone
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $acarbose
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $miglitol
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $troglitazone
## [1] "No"     "Steady"
## 
## $tolazamide
## [1] "No"     "Steady" "Up"    
## 
## $examide
## [1] "No"
## 
## $citoglipton
## [1] "No"
## 
## $insulin
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $glyburide.metformin
## [1] "Down"   "No"     "Steady" "Up"    
## 
## $glipizide.metformin
## [1] "No"     "Steady"
## 
## $glimepiride.pioglitazone
## [1] "No"     "Steady"
## 
## $metformin.rosiglitazone
## [1] "No"     "Steady"
## 
## $metformin.pioglitazone
## [1] "No"     "Steady"
## 
## $change
## [1] "Ch" "No"
## 
## $diabetesMed
## [1] "No"  "Yes"
## 
## $readmitted
## [1] "<30" ">30" "NO"
db.init$weight <- factor(db.init$weight, levels = c("?", "[0-25)", "[25-50)", "[50-75)", "[75-100)", "[100-125)", "[125-150)", "[150-175)", "[175-200)", ">200")) # Reorder the factor levels for weight to represent true ordering

# Need to convert admission_type_id, discharge_disposition_id and admission_source_id each to a factor and add in level names (based on supplementary id_mapping document)

table(db.init$admission_type_id) # Confirmed that all levels (1-8) are present and each have cases
## 
##     1     2     3     4     5     6     7     8 
## 53990 18480 18869    10  4785  5291    21   320
db.init$admission_type_id <- as.factor(db.init$admission_type_id)
levels(db.init$admission_type_id) <- c('Emergency', 'Urgent', 'Elective', 'Newborn', 'NotAvailable', 'NULL', 'TraumaCenter', 'NotMapped')

table(db.init$discharge_disposition_id) # Categories 21, 26, 29, and 30 are missing when compared to id_mapping doc; missing because 0 cases; will not add level names for them
## 
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## 60234  2128 13954   815  1184 12902   623   108    21     6  1642     3   399 
##    14    15    16    17    18    19    20    22    23    24    25    27    28 
##   372    63    11    14  3691     8     2  1993   412    48   989     5   139
db.init$discharge_disposition_id <- as.factor(db.init$discharge_disposition_id)
levels(db.init$discharge_disposition_id) <- c('DcHome', 'DcAnotherSTHospital', 'DcSNF', 'DcICF', 'DcOtherTypeInpatient', 'DcHomeWithHomeService', 'LeftAMA', 'DcHomeWithIVCare', 'AdmittedInpatientThisHospital', 'NeonateDcOtherHospital', 'Expired', 'StillPatientOrOutpatient', 'HospiceHome', 'HospiceFacility', 'DcWithinSwingBed', 'RefOtherOutpatient', 'RefSameOutpatient', 'NULL', 'ExpiredHome', 'ExpiredFacility', 'DcRehab', 'DcLTCHospital', 'DcMedicaidNursingFacility', 'NotMapped', 'DcFederalHealthFacility','DcPsychiatric')

table(db.init$admission_source_id) # Categories 12, 15, 18, 19, 21, 23, 24, and 26 are missing when compared to id_mapping doc. Note: there is no category 16 in the id_mapping doc, nor is one present in the data
## 
##     1     2     3     4     5     6     7     8     9    10    11    13    14 
## 29565  1104   187  3187   855  2264 57494    16   125     8     2     1     2 
##    17    20    22    25 
##  6781   161    12     2
db.init$admission_source_id <- as.factor(db.init$admission_source_id)
levels(db.init$admission_source_id) <- c('PhysRef', 'ClinicRef', 'HMORef', 'TransferFromHospital', 'TransferFromSNF', 'TransferFromOtherHealthFacility', 'ER', 'CourtOrLawEnf', 'NotAvailable', 'TransferFromCriticalAccessHospital', 'NormalDelivery', 'SickBaby', 'ExtramuralBirth', 'NULL', 'NotMapped', 'SepClaim', 'TransferFromAmbSurg')

# No need to check specifically for overlap in levels of diagnoses, as these are all ICD-9-CM codes. Just skimmed the levels to ensure the levels were either ICD codes or '?'
levels(db.init$diag_1)
##   [1] "?"      "10"     "11"     "110"    "112"    "114"    "115"    "117"   
##   [9] "131"    "133"    "135"    "136"    "141"    "142"    "143"    "145"   
##  [17] "146"    "147"    "148"    "149"    "150"    "151"    "152"    "153"   
##  [25] "154"    "155"    "156"    "157"    "158"    "160"    "161"    "162"   
##  [33] "163"    "164"    "170"    "171"    "172"    "173"    "174"    "175"   
##  [41] "179"    "180"    "182"    "183"    "184"    "185"    "187"    "188"   
##  [49] "189"    "191"    "192"    "193"    "194"    "195"    "196"    "197"   
##  [57] "198"    "199"    "200"    "201"    "202"    "203"    "204"    "205"   
##  [65] "207"    "208"    "210"    "211"    "212"    "214"    "215"    "216"   
##  [73] "217"    "218"    "219"    "220"    "223"    "225"    "226"    "227"   
##  [81] "228"    "229"    "23"     "230"    "233"    "235"    "236"    "237"   
##  [89] "238"    "239"    "240"    "241"    "242"    "244"    "245"    "246"   
##  [97] "250"    "250.01" "250.02" "250.03" "250.1"  "250.11" "250.12" "250.13"
## [105] "250.2"  "250.21" "250.22" "250.23" "250.3"  "250.31" "250.32" "250.33"
## [113] "250.4"  "250.41" "250.42" "250.43" "250.5"  "250.51" "250.52" "250.53"
## [121] "250.6"  "250.7"  "250.8"  "250.81" "250.82" "250.83" "250.9"  "250.91"
## [129] "250.92" "250.93" "251"    "252"    "253"    "255"    "261"    "262"   
## [137] "263"    "266"    "27"     "271"    "272"    "273"    "274"    "275"   
## [145] "276"    "277"    "278"    "279"    "280"    "281"    "282"    "283"   
## [153] "284"    "285"    "286"    "287"    "288"    "289"    "290"    "291"   
## [161] "292"    "293"    "294"    "295"    "296"    "297"    "298"    "299"   
## [169] "3"      "300"    "301"    "303"    "304"    "305"    "306"    "307"   
## [177] "308"    "309"    "31"     "310"    "311"    "312"    "314"    "318"   
## [185] "320"    "322"    "323"    "324"    "325"    "327"    "331"    "332"   
## [193] "333"    "334"    "335"    "336"    "337"    "338"    "34"     "340"   
## [201] "341"    "342"    "344"    "345"    "346"    "347"    "348"    "349"   
## [209] "35"     "350"    "351"    "352"    "353"    "354"    "355"    "356"   
## [217] "357"    "358"    "359"    "36"     "360"    "361"    "362"    "363"   
## [225] "365"    "366"    "368"    "369"    "370"    "372"    "373"    "374"   
## [233] "375"    "376"    "377"    "378"    "379"    "38"     "380"    "381"   
## [241] "382"    "383"    "384"    "385"    "386"    "388"    "389"    "39"    
## [249] "391"    "394"    "395"    "396"    "397"    "398"    "401"    "402"   
## [257] "403"    "404"    "405"    "41"     "410"    "411"    "412"    "413"   
## [265] "414"    "415"    "416"    "417"    "42"     "420"    "421"    "422"   
## [273] "423"    "424"    "425"    "426"    "427"    "428"    "429"    "430"   
## [281] "431"    "432"    "433"    "434"    "435"    "436"    "437"    "438"   
## [289] "440"    "441"    "442"    "443"    "444"    "445"    "446"    "447"   
## [297] "448"    "451"    "452"    "453"    "454"    "455"    "456"    "457"   
## [305] "458"    "459"    "461"    "462"    "463"    "464"    "465"    "466"   
## [313] "47"     "470"    "471"    "473"    "474"    "475"    "477"    "478"   
## [321] "48"     "480"    "481"    "482"    "483"    "485"    "486"    "487"   
## [329] "49"     "490"    "491"    "492"    "493"    "494"    "495"    "496"   
## [337] "5"      "500"    "501"    "506"    "507"    "508"    "510"    "511"   
## [345] "512"    "513"    "514"    "515"    "516"    "518"    "519"    "52"    
## [353] "521"    "522"    "523"    "524"    "526"    "527"    "528"    "529"   
## [361] "53"     "530"    "531"    "532"    "533"    "534"    "535"    "536"   
## [369] "537"    "54"     "540"    "541"    "542"    "543"    "550"    "551"   
## [377] "552"    "553"    "555"    "556"    "557"    "558"    "560"    "562"   
## [385] "564"    "565"    "566"    "567"    "568"    "569"    "57"     "570"   
## [393] "571"    "572"    "573"    "574"    "575"    "576"    "577"    "578"   
## [401] "579"    "58"     "580"    "581"    "582"    "583"    "584"    "585"   
## [409] "586"    "588"    "590"    "591"    "592"    "593"    "594"    "595"   
## [417] "596"    "598"    "599"    "600"    "601"    "602"    "603"    "604"   
## [425] "605"    "607"    "608"    "61"     "610"    "611"    "614"    "615"   
## [433] "616"    "617"    "618"    "619"    "620"    "621"    "622"    "623"   
## [441] "625"    "626"    "627"    "632"    "633"    "634"    "637"    "640"   
## [449] "641"    "642"    "643"    "644"    "645"    "646"    "647"    "648"   
## [457] "649"    "652"    "653"    "654"    "655"    "656"    "657"    "658"   
## [465] "659"    "66"     "660"    "661"    "663"    "664"    "665"    "669"   
## [473] "671"    "674"    "680"    "681"    "682"    "683"    "684"    "685"   
## [481] "686"    "690"    "691"    "692"    "693"    "694"    "695"    "696"   
## [489] "698"    "7"      "70"     "700"    "703"    "704"    "705"    "706"   
## [497] "707"    "708"    "709"    "710"    "711"    "714"    "715"    "716"   
## [505] "717"    "718"    "719"    "720"    "721"    "722"    "723"    "724"   
## [513] "725"    "726"    "727"    "728"    "729"    "730"    "731"    "732"   
## [521] "733"    "734"    "735"    "736"    "737"    "738"    "745"    "746"   
## [529] "747"    "75"     "751"    "753"    "756"    "759"    "78"     "780"   
## [537] "781"    "782"    "783"    "784"    "785"    "786"    "787"    "788"   
## [545] "789"    "79"     "790"    "791"    "792"    "793"    "794"    "795"   
## [553] "796"    "797"    "799"    "8"      "800"    "801"    "802"    "803"   
## [561] "804"    "805"    "806"    "807"    "808"    "810"    "812"    "813"   
## [569] "814"    "815"    "816"    "817"    "82"     "820"    "821"    "822"   
## [577] "823"    "824"    "825"    "826"    "827"    "831"    "832"    "833"   
## [585] "834"    "835"    "836"    "837"    "838"    "839"    "84"     "840"   
## [593] "842"    "843"    "844"    "845"    "846"    "847"    "848"    "850"   
## [601] "851"    "852"    "853"    "854"    "860"    "861"    "862"    "863"   
## [609] "864"    "865"    "866"    "867"    "868"    "870"    "871"    "873"   
## [617] "875"    "878"    "879"    "88"     "880"    "881"    "882"    "883"   
## [625] "885"    "886"    "890"    "891"    "892"    "893"    "895"    "897"   
## [633] "9"      "903"    "904"    "906"    "911"    "913"    "914"    "915"   
## [641] "916"    "917"    "919"    "920"    "921"    "922"    "923"    "924"   
## [649] "928"    "933"    "934"    "935"    "936"    "939"    "94"     "941"   
## [657] "942"    "944"    "945"    "952"    "955"    "957"    "958"    "959"   
## [665] "962"    "963"    "964"    "965"    "966"    "967"    "968"    "969"   
## [673] "97"     "970"    "971"    "972"    "973"    "974"    "975"    "976"   
## [681] "977"    "98"     "980"    "982"    "983"    "986"    "987"    "988"   
## [689] "989"    "990"    "991"    "992"    "994"    "995"    "996"    "997"   
## [697] "998"    "999"    "E909"   "V07"    "V25"    "V26"    "V43"    "V45"   
## [705] "V51"    "V53"    "V54"    "V55"    "V56"    "V57"    "V58"    "V60"   
## [713] "V63"    "V66"    "V67"    "V70"    "V71"
levels(db.init$diag_2)
##   [1] "?"      "11"     "110"    "111"    "112"    "114"    "115"    "117"   
##   [9] "123"    "130"    "131"    "135"    "136"    "137"    "138"    "140"   
##  [17] "141"    "145"    "150"    "151"    "152"    "153"    "154"    "155"   
##  [25] "156"    "157"    "162"    "163"    "164"    "171"    "172"    "173"   
##  [33] "174"    "179"    "180"    "182"    "183"    "185"    "186"    "188"   
##  [41] "189"    "191"    "192"    "193"    "195"    "196"    "197"    "198"   
##  [49] "199"    "200"    "201"    "202"    "203"    "204"    "205"    "208"   
##  [57] "211"    "212"    "214"    "215"    "217"    "218"    "220"    "223"   
##  [65] "225"    "226"    "227"    "228"    "232"    "233"    "235"    "238"   
##  [73] "239"    "240"    "241"    "242"    "244"    "245"    "246"    "250"   
##  [81] "250.01" "250.02" "250.03" "250.1"  "250.11" "250.12" "250.13" "250.2" 
##  [89] "250.21" "250.22" "250.23" "250.3"  "250.31" "250.32" "250.33" "250.4" 
##  [97] "250.41" "250.42" "250.43" "250.5"  "250.51" "250.52" "250.53" "250.6" 
## [105] "250.7"  "250.8"  "250.81" "250.82" "250.83" "250.9"  "250.91" "250.92"
## [113] "250.93" "251"    "252"    "253"    "255"    "256"    "258"    "259"   
## [121] "260"    "261"    "262"    "263"    "266"    "268"    "269"    "27"    
## [129] "270"    "271"    "272"    "273"    "274"    "275"    "276"    "277"   
## [137] "278"    "279"    "280"    "281"    "282"    "283"    "284"    "285"   
## [145] "286"    "287"    "288"    "289"    "290"    "291"    "292"    "293"   
## [153] "294"    "295"    "296"    "297"    "298"    "299"    "300"    "301"   
## [161] "302"    "303"    "304"    "305"    "306"    "307"    "308"    "309"   
## [169] "31"     "310"    "311"    "312"    "314"    "316"    "317"    "318"   
## [177] "319"    "320"    "322"    "323"    "324"    "325"    "327"    "331"   
## [185] "332"    "333"    "335"    "336"    "337"    "338"    "34"     "340"   
## [193] "341"    "342"    "343"    "344"    "345"    "346"    "347"    "348"   
## [201] "349"    "35"     "350"    "351"    "352"    "353"    "354"    "355"   
## [209] "356"    "357"    "358"    "359"    "360"    "362"    "364"    "365"   
## [217] "366"    "368"    "369"    "372"    "373"    "374"    "376"    "377"   
## [225] "378"    "379"    "38"     "380"    "381"    "382"    "383"    "386"   
## [233] "388"    "389"    "394"    "395"    "396"    "397"    "398"    "40"    
## [241] "401"    "402"    "403"    "404"    "405"    "41"     "410"    "411"   
## [249] "412"    "413"    "414"    "415"    "416"    "42"     "420"    "421"   
## [257] "422"    "423"    "424"    "425"    "426"    "427"    "428"    "429"   
## [265] "430"    "431"    "432"    "433"    "434"    "435"    "436"    "437"   
## [273] "438"    "440"    "441"    "442"    "443"    "444"    "446"    "447"   
## [281] "448"    "451"    "452"    "453"    "454"    "455"    "456"    "457"   
## [289] "458"    "459"    "46"     "460"    "461"    "462"    "463"    "464"   
## [297] "465"    "466"    "470"    "472"    "473"    "474"    "475"    "477"   
## [305] "478"    "480"    "481"    "482"    "483"    "484"    "485"    "486"   
## [313] "487"    "490"    "491"    "492"    "493"    "494"    "495"    "496"   
## [321] "5"      "500"    "501"    "506"    "507"    "508"    "510"    "511"   
## [329] "512"    "513"    "514"    "515"    "516"    "517"    "518"    "519"   
## [337] "52"     "520"    "521"    "522"    "523"    "524"    "527"    "528"   
## [345] "529"    "53"     "530"    "531"    "532"    "533"    "534"    "535"   
## [353] "536"    "537"    "54"     "540"    "542"    "543"    "550"    "552"   
## [361] "553"    "555"    "556"    "557"    "558"    "560"    "562"    "564"   
## [369] "565"    "566"    "567"    "568"    "569"    "570"    "571"    "572"   
## [377] "573"    "574"    "575"    "576"    "577"    "578"    "579"    "580"   
## [385] "581"    "583"    "584"    "585"    "586"    "588"    "590"    "591"   
## [393] "592"    "593"    "594"    "595"    "596"    "598"    "599"    "600"   
## [401] "601"    "602"    "603"    "604"    "605"    "607"    "608"    "610"   
## [409] "611"    "614"    "615"    "616"    "617"    "618"    "619"    "620"   
## [417] "621"    "622"    "623"    "625"    "626"    "627"    "634"    "641"   
## [425] "642"    "644"    "645"    "646"    "647"    "648"    "649"    "652"   
## [433] "654"    "656"    "658"    "659"    "66"     "661"    "663"    "664"   
## [441] "665"    "670"    "674"    "680"    "681"    "682"    "683"    "684"   
## [449] "685"    "686"    "691"    "692"    "693"    "694"    "695"    "696"   
## [457] "698"    "7"      "70"     "701"    "702"    "703"    "704"    "705"   
## [465] "706"    "707"    "709"    "710"    "711"    "712"    "713"    "714"   
## [473] "715"    "716"    "717"    "718"    "719"    "721"    "722"    "723"   
## [481] "724"    "725"    "726"    "727"    "728"    "729"    "730"    "731"   
## [489] "733"    "734"    "736"    "737"    "738"    "741"    "742"    "745"   
## [497] "746"    "747"    "748"    "75"     "750"    "751"    "752"    "753"   
## [505] "754"    "755"    "756"    "758"    "759"    "78"     "780"    "781"   
## [513] "782"    "783"    "784"    "785"    "786"    "787"    "788"    "789"   
## [521] "79"     "790"    "791"    "792"    "793"    "794"    "795"    "796"   
## [529] "797"    "799"    "8"      "800"    "801"    "802"    "805"    "806"   
## [537] "807"    "808"    "810"    "811"    "812"    "813"    "814"    "815"   
## [545] "816"    "820"    "821"    "822"    "823"    "824"    "825"    "826"   
## [553] "831"    "832"    "833"    "836"    "837"    "840"    "842"    "843"   
## [561] "844"    "845"    "846"    "847"    "850"    "851"    "852"    "853"   
## [569] "860"    "861"    "862"    "863"    "864"    "865"    "866"    "867"   
## [577] "868"    "869"    "870"    "871"    "872"    "873"    "879"    "88"    
## [585] "880"    "881"    "882"    "883"    "884"    "891"    "892"    "893"   
## [593] "894"    "9"      "905"    "906"    "907"    "908"    "909"    "910"   
## [601] "911"    "912"    "913"    "915"    "916"    "917"    "918"    "919"   
## [609] "920"    "921"    "922"    "923"    "924"    "927"    "933"    "934"   
## [617] "94"     "942"    "944"    "945"    "947"    "948"    "952"    "953"   
## [625] "955"    "958"    "959"    "96"     "962"    "963"    "965"    "967"   
## [633] "968"    "969"    "972"    "974"    "975"    "977"    "980"    "987"   
## [641] "989"    "99"     "990"    "991"    "992"    "994"    "995"    "996"   
## [649] "997"    "998"    "999"    "E812"   "E813"   "E814"   "E816"   "E817"  
## [657] "E818"   "E819"   "E821"   "E826"   "E829"   "E849"   "E850"   "E853"  
## [665] "E854"   "E858"   "E868"   "E870"   "E878"   "E879"   "E880"   "E881"  
## [673] "E882"   "E883"   "E884"   "E885"   "E887"   "E888"   "E890"   "E900"  
## [681] "E905"   "E906"   "E915"   "E916"   "E917"   "E918"   "E919"   "E924"  
## [689] "E927"   "E928"   "E929"   "E930"   "E931"   "E932"   "E933"   "E934"  
## [697] "E935"   "E936"   "E937"   "E938"   "E939"   "E941"   "E942"   "E944"  
## [705] "E945"   "E947"   "E950"   "E965"   "E968"   "E980"   "V02"    "V03"   
## [713] "V08"    "V09"    "V10"    "V11"    "V12"    "V13"    "V14"    "V15"   
## [721] "V16"    "V17"    "V18"    "V23"    "V25"    "V42"    "V43"    "V44"   
## [729] "V45"    "V46"    "V49"    "V50"    "V53"    "V54"    "V55"    "V57"   
## [737] "V58"    "V60"    "V61"    "V62"    "V63"    "V64"    "V65"    "V66"   
## [745] "V69"    "V70"    "V72"    "V85"    "V86"
levels(db.init$diag_3)
##   [1] "?"      "11"     "110"    "111"    "112"    "115"    "117"    "122"   
##   [9] "123"    "131"    "132"    "135"    "136"    "138"    "139"    "14"    
##  [17] "141"    "146"    "148"    "150"    "151"    "152"    "153"    "154"   
##  [25] "155"    "156"    "157"    "158"    "161"    "162"    "163"    "164"   
##  [33] "17"     "170"    "171"    "172"    "173"    "174"    "175"    "179"   
##  [41] "180"    "182"    "183"    "185"    "186"    "188"    "189"    "191"   
##  [49] "192"    "193"    "195"    "196"    "197"    "198"    "199"    "200"   
##  [57] "201"    "202"    "203"    "204"    "205"    "208"    "211"    "214"   
##  [65] "215"    "216"    "217"    "218"    "220"    "223"    "225"    "226"   
##  [73] "227"    "228"    "230"    "233"    "235"    "236"    "238"    "239"   
##  [81] "240"    "241"    "242"    "243"    "244"    "245"    "246"    "250"   
##  [89] "250.01" "250.02" "250.03" "250.1"  "250.11" "250.12" "250.13" "250.2" 
##  [97] "250.21" "250.22" "250.23" "250.3"  "250.31" "250.4"  "250.41" "250.42"
## [105] "250.43" "250.5"  "250.51" "250.52" "250.53" "250.6"  "250.7"  "250.8" 
## [113] "250.81" "250.82" "250.83" "250.9"  "250.91" "250.92" "250.93" "251"   
## [121] "252"    "253"    "255"    "256"    "258"    "259"    "260"    "261"   
## [129] "262"    "263"    "265"    "266"    "268"    "27"     "270"    "271"   
## [137] "272"    "273"    "274"    "275"    "276"    "277"    "278"    "279"   
## [145] "280"    "281"    "282"    "283"    "284"    "285"    "286"    "287"   
## [153] "288"    "289"    "290"    "291"    "292"    "293"    "294"    "295"   
## [161] "296"    "297"    "298"    "299"    "3"      "300"    "301"    "303"   
## [169] "304"    "305"    "306"    "307"    "308"    "309"    "310"    "311"   
## [177] "312"    "313"    "314"    "315"    "317"    "318"    "319"    "323"   
## [185] "327"    "331"    "332"    "333"    "334"    "335"    "336"    "337"   
## [193] "338"    "34"     "340"    "341"    "342"    "343"    "344"    "345"   
## [201] "346"    "347"    "348"    "349"    "35"     "350"    "351"    "353"   
## [209] "354"    "355"    "356"    "357"    "358"    "359"    "360"    "361"   
## [217] "362"    "365"    "365.44" "366"    "368"    "369"    "370"    "372"   
## [225] "373"    "374"    "376"    "377"    "378"    "379"    "38"     "380"   
## [233] "381"    "382"    "383"    "384"    "385"    "386"    "387"    "388"   
## [241] "389"    "391"    "394"    "395"    "396"    "397"    "398"    "401"   
## [249] "402"    "403"    "404"    "405"    "41"     "410"    "411"    "412"   
## [257] "413"    "414"    "415"    "416"    "417"    "42"     "420"    "421"   
## [265] "423"    "424"    "425"    "426"    "427"    "428"    "429"    "430"   
## [273] "431"    "432"    "433"    "434"    "435"    "436"    "437"    "438"   
## [281] "440"    "441"    "442"    "443"    "444"    "445"    "446"    "447"   
## [289] "448"    "451"    "452"    "453"    "454"    "455"    "456"    "457"   
## [297] "458"    "459"    "460"    "461"    "462"    "463"    "464"    "465"   
## [305] "466"    "47"     "470"    "472"    "473"    "475"    "477"    "478"   
## [313] "480"    "481"    "482"    "483"    "484"    "485"    "486"    "487"   
## [321] "49"     "490"    "491"    "492"    "493"    "494"    "495"    "496"   
## [329] "5"      "500"    "501"    "506"    "507"    "508"    "510"    "511"   
## [337] "512"    "514"    "515"    "516"    "517"    "518"    "519"    "521"   
## [345] "522"    "523"    "524"    "525"    "527"    "528"    "529"    "53"    
## [353] "530"    "531"    "532"    "533"    "534"    "535"    "536"    "537"   
## [361] "538"    "54"     "540"    "542"    "543"    "550"    "552"    "553"   
## [369] "555"    "556"    "557"    "558"    "560"    "562"    "564"    "565"   
## [377] "566"    "567"    "568"    "569"    "57"     "570"    "571"    "572"   
## [385] "573"    "574"    "575"    "576"    "577"    "578"    "579"    "580"   
## [393] "581"    "582"    "583"    "584"    "585"    "586"    "588"    "590"   
## [401] "591"    "592"    "593"    "594"    "595"    "596"    "597"    "598"   
## [409] "599"    "600"    "601"    "602"    "603"    "604"    "605"    "607"   
## [417] "608"    "610"    "611"    "614"    "616"    "617"    "618"    "619"   
## [425] "620"    "621"    "622"    "623"    "624"    "625"    "626"    "627"   
## [433] "641"    "642"    "643"    "644"    "646"    "647"    "648"    "649"   
## [441] "652"    "653"    "654"    "655"    "656"    "657"    "658"    "659"   
## [449] "66"     "660"    "661"    "663"    "664"    "665"    "669"    "670"   
## [457] "671"    "674"    "680"    "681"    "682"    "684"    "685"    "686"   
## [465] "690"    "692"    "693"    "694"    "695"    "696"    "697"    "698"   
## [473] "7"      "70"     "701"    "702"    "703"    "704"    "705"    "706"   
## [481] "707"    "708"    "709"    "710"    "711"    "712"    "713"    "714"   
## [489] "715"    "716"    "717"    "718"    "719"    "720"    "721"    "722"   
## [497] "723"    "724"    "725"    "726"    "727"    "728"    "729"    "730"   
## [505] "731"    "732"    "733"    "734"    "735"    "736"    "737"    "738"   
## [513] "741"    "742"    "744"    "745"    "746"    "747"    "75"     "750"   
## [521] "751"    "752"    "753"    "754"    "755"    "756"    "757"    "758"   
## [529] "759"    "78"     "780"    "781"    "782"    "783"    "784"    "785"   
## [537] "786"    "787"    "788"    "789"    "79"     "790"    "791"    "792"   
## [545] "793"    "794"    "795"    "796"    "797"    "799"    "8"      "800"   
## [553] "801"    "802"    "805"    "807"    "808"    "810"    "811"    "812"   
## [561] "813"    "814"    "815"    "816"    "820"    "821"    "822"    "823"   
## [569] "824"    "825"    "826"    "831"    "834"    "836"    "837"    "838"   
## [577] "840"    "841"    "842"    "844"    "845"    "847"    "848"    "850"   
## [585] "851"    "852"    "853"    "854"    "860"    "861"    "862"    "863"   
## [593] "864"    "865"    "866"    "867"    "868"    "870"    "871"    "872"   
## [601] "873"    "875"    "876"    "877"    "879"    "88"     "880"    "881"   
## [609] "882"    "883"    "884"    "890"    "891"    "892"    "893"    "9"     
## [617] "905"    "906"    "907"    "908"    "909"    "910"    "911"    "912"   
## [625] "913"    "915"    "916"    "917"    "918"    "919"    "920"    "921"   
## [633] "922"    "923"    "924"    "928"    "930"    "933"    "934"    "935"   
## [641] "94"     "942"    "943"    "944"    "945"    "948"    "951"    "952"   
## [649] "953"    "955"    "956"    "958"    "959"    "962"    "965"    "966"   
## [657] "967"    "969"    "970"    "971"    "972"    "980"    "987"    "989"   
## [665] "991"    "992"    "995"    "996"    "997"    "998"    "999"    "E812"  
## [673] "E813"   "E815"   "E816"   "E817"   "E818"   "E819"   "E822"   "E825"  
## [681] "E826"   "E828"   "E849"   "E850"   "E852"   "E853"   "E854"   "E855"  
## [689] "E858"   "E861"   "E864"   "E865"   "E870"   "E876"   "E878"   "E879"  
## [697] "E880"   "E881"   "E882"   "E883"   "E884"   "E885"   "E886"   "E887"  
## [705] "E888"   "E892"   "E894"   "E900"   "E901"   "E904"   "E905"   "E906"  
## [713] "E912"   "E915"   "E916"   "E917"   "E919"   "E920"   "E922"   "E924"  
## [721] "E927"   "E928"   "E929"   "E930"   "E931"   "E932"   "E933"   "E934"  
## [729] "E935"   "E936"   "E937"   "E938"   "E939"   "E941"   "E942"   "E943"  
## [737] "E944"   "E945"   "E946"   "E947"   "E949"   "E950"   "E955"   "E956"  
## [745] "E965"   "E966"   "E980"   "E987"   "V01"    "V02"    "V03"    "V06"   
## [753] "V07"    "V08"    "V09"    "V10"    "V11"    "V12"    "V13"    "V14"   
## [761] "V15"    "V16"    "V17"    "V18"    "V22"    "V23"    "V25"    "V27"   
## [769] "V42"    "V43"    "V44"    "V45"    "V46"    "V49"    "V53"    "V54"   
## [777] "V55"    "V57"    "V58"    "V60"    "V61"    "V62"    "V63"    "V64"   
## [785] "V65"    "V66"    "V70"    "V72"    "V85"    "V86"

All remaining attributes are the correct data types.

Remove Out of Scope Cases

If there are multiple encounters for the same patient, we want to remove any after the first one. This is because we want to ensure observations are independent, so that we may run statistical tests on them during analysis and not violate the assumption of independence. We are interested in the the first encounter of each patient, as we are focused on early identification of the risks of readmission.

Additionally, cases where the patient died, or were sent to hospice for end-of-life care should be excluded, as there is no chance of readmission.

length(unique(db.init$encounter_id)) # There are 101766 unique encounters, which is the total number of observations in the dataset
## [1] 101766
length(unique(db.init$patient_nbr)) # There are only 71518 unique patient IDs, indicating that a number of patients have 2 or more encounters
## [1] 71518
db.init <- db.init[order(db.init$encounter_id),] # Sort patient encounters in ascending order; lower numbers indicate earlier encounters
db.init <- db.init[!duplicated(db.init$patient_nbr), ] # Remove rows with duplicated patient IDs; the first instance (with the lower encounter number) will be kept, with any later instances removed
dim(db.init) # Confirmed the resulting data frame has 50 attributes and 71518 observations, as expected
## [1] 71518    50
db.init <- subset(db.init, discharge_disposition_id != 'Expired' & discharge_disposition_id != 'ExpiredHome' & discharge_disposition_id != 'ExpiredFacility' & discharge_disposition_id != 'HospiceHome' & discharge_disposition_id != 'HospiceFacility') # Remove cases where the patient died or was sent to hospice
dim(db.init) # There are now 69973 observations for 50 attributes. Note that in the original research article, they had the same criteria for case removal, but were left with 69984 cases (difference of 11 cases) - unsure at this time as to the reason
## [1] 69973    50

Univariate Analysis with Attribute and Factor Level Reduction

Now that all attributes have the correct data types, let’s review the updated structure.

str(db.init)
## 'data.frame':    69973 obs. of  50 variables:
##  $ encounter_id            : int  12522 15738 16680 28236 35754 36900 40926 42570 55842 62256 ...
##  $ patient_nbr             : int  48330783 63555939 42519267 89869032 82637451 77391171 85504905 77586282 84259809 49726791 ...
##  $ race                    : Factor w/ 6 levels "?","AfricanAmerican",..: 4 4 4 2 4 2 4 4 4 2 ...
##  $ gender                  : Factor w/ 3 levels "Female","Male",..: 1 1 2 1 2 2 1 2 2 1 ...
##  $ age                     : Factor w/ 10 levels "[0-10)","[10-20)",..: 9 10 5 5 6 7 5 9 7 7 ...
##  $ weight                  : Factor w/ 10 levels "?","[0-25)","[25-50)",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ admission_type_id       : Factor w/ 8 levels "Emergency","Urgent",..: 2 3 1 1 2 2 1 1 3 3 ...
##  $ discharge_disposition_id: Factor w/ 26 levels "DcHome","DcAnotherSTHospital",..: 1 3 1 1 1 1 3 6 1 1 ...
##  $ admission_source_id     : Factor w/ 17 levels "PhysRef","ClinicRef",..: 4 4 7 7 2 4 7 7 2 2 ...
##  $ time_in_hospital        : int  13 12 1 9 3 7 7 10 4 1 ...
##  $ payer_code              : Factor w/ 18 levels "?","BC","CH",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ medical_specialty       : Factor w/ 73 levels "?","AllergyandImmunology",..: 1 20 1 1 1 1 13 13 1 1 ...
##  $ num_lab_procedures      : int  68 33 51 47 31 62 60 55 70 49 ...
##  $ num_procedures          : int  2 3 0 2 6 0 0 1 1 5 ...
##  $ num_medications         : int  28 18 8 17 16 11 15 31 21 2 ...
##  $ number_outpatient       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ number_inpatient        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_1                  : Factor w/ 717 levels "?","10","11",..: 254 284 56 122 265 28 278 278 265 350 ...
##  $ diag_2                  : Factor w/ 749 levels "?","11","110",..: 262 48 26 243 248 147 99 248 248 650 ...
##  $ diag_3                  : Factor w/ 790 levels "?","11","110",..: 231 319 88 668 88 53 110 269 772 432 ...
##  $ number_diagnoses        : int  8 8 5 9 9 7 8 8 7 8 ...
##  $ max_glu_serum           : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ A1Cresult               : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ metformin               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 3 2 ...
##  $ repaglinide             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 4 2 2 2 ...
##  $ nateglinide             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ chlorpropamide          : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ glimepiride             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 3 2 ...
##  $ acetohexamide           : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ glipizide               : Factor w/ 4 levels "Down","No","Steady",..: 3 2 3 2 2 2 2 2 2 2 ...
##  $ glyburide               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 4 2 2 2 2 ...
##  $ tolbutamide             : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ pioglitazone            : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ rosiglitazone           : Factor w/ 4 levels "Down","No","Steady",..: 2 3 2 2 2 2 2 2 2 2 ...
##  $ acarbose                : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ miglitol                : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ troglitazone            : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ tolazamide              : Factor w/ 3 levels "No","Steady",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ examide                 : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
##  $ citoglipton             : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
##  $ insulin                 : Factor w/ 4 levels "Down","No","Steady",..: 3 3 3 3 3 3 1 3 3 3 ...
##  $ glyburide.metformin     : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ glipizide.metformin     : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ glimepiride.pioglitazone: Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ metformin.rosiglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ metformin.pioglitazone  : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
##  $ change                  : Factor w/ 2 levels "Ch","No": 1 1 1 2 2 1 1 2 1 2 ...
##  $ diabetesMed             : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ readmitted              : Factor w/ 3 levels "<30",">30","NO": 3 3 3 2 2 1 1 3 3 2 ...

Numeric Attributes

First we will create a custom function to calculate Tukey fences so that we may identify outliers.

fences <- function(x){
  y1 <- unname(summary(x)[2]) - 1.5*(IQR(x)) 
  y2 <- unname(summary(x)[5]) + 1.5*(IQR(x))
  z <- c(y1, y2)
  return(z)
}

Then we will go through each numeric attribute individually to review the summary statistics, distribution, and outliers. We already know there are no missing values for the numeric attributes.

Encounter ID and Patient Number

length(unique(db.init$patient_nbr)) # Now that the number of unique patient ids matches the number of total observations (69973), there is no further need for the patient number or encounter id attributes. They are not relevant to any analysis, and will be removed for model creation
## [1] 69973

Time in Hospital

summary(db.init$time_in_hospital) # Ranges from 1-14, skewed to the right with a median of 3, mean of 4.273
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.273   6.000  14.000
sd(db.init$time_in_hospital) # Standard deviation of 2.934
## [1] 2.933924
boxplot(db.init$time_in_hospital, horizontal = T, col = '#83afe6', xlab = 'Time in Hospital') # Visible outliers above upper fence

fences(db.init$time_in_hospital) # Lower fence is -4 (below the min), and upper fence is 12
## [1] -4 12
length(subset(db.init$time_in_hospital, db.init$time_in_hospital > 12)) # 1403 records have a length of stay higher than the upper fence
## [1] 1403
hist.time_in_hospital <- ggplot(db.init, aes(time_in_hospital)) + geom_histogram(binwidth = 1, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'Time in Hospital', y = 'Count') + theme(axis.title.y = element_blank()) 
hist.time_in_hospital # Distribution is visibly skewed to the right

Number of Lab Procedures

summary(db.init$num_lab_procedures) # Ranges from 1-132, with a median of 44, mean of 42.88
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   31.00   44.00   42.88   57.00  132.00
sd(db.init$num_lab_procedures) # Standard deviation of 19.895
## [1] 19.89453
boxplot(db.init$num_lab_procedures, horizontal = T, col = '#83afe6', xlab = 'No. Lab Procedures') # Visible outliers above upper fence

fences(db.init$num_lab_procedures) # Lower fence is -8 (below the min), and upper fence is 96
## [1] -8 96
length(subset(db.init$num_lab_procedures, db.init$num_lab_procedures > 96)) # 98 records have a number of lab procedures higher than the upper fence
## [1] 98
hist.num_lab_procedures <- ggplot(db.init, aes(num_lab_procedures)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Lab Procedures', y = 'Count') + theme(axis.title.y = element_blank()) 
hist.num_lab_procedures # Distribution is unimodal except for large number of values at 1, with outliers visible to the right

length(subset(db.init$num_lab_procedures, db.init$num_lab_procedures == 1)) # Of note, 2254 patients had only one lab procedure done
## [1] 2254

Number of Procedures

summary(db.init$num_procedures) # Ranges from 0-6, with a median of 1, mean of 1.426
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.426   2.000   6.000
sd(db.init$num_procedures) # Standard deviation of 1.757
## [1] 1.757131
boxplot(db.init$num_procedures, horizontal = T, col = '#83afe6', xlab = 'No. Procedures') # Visible outlier(s) above upper fence

fences(db.init$num_procedures) # Lower fence is -3 (below the min), and upper fence is 5
## [1] -3  5
length(subset(db.init$num_procedures, db.init$num_procedures > 5)) # 3845 records have a number of lab procedures higher than the upper fence. However, the only value above the upper fence is 6, so these are all patients who had 6 lab procedures done
## [1] 3845
hist.num_procedures <- ggplot(db.init, aes(num_procedures)) + geom_histogram(binwidth = 1, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Procedures', y = 'Count') + theme(axis.title.y = element_blank()) 
hist.num_procedures # Distribution is skewed right

Number of Medications

summary(db.init$num_medications) # Ranges from 1-81, with a median of 14, mean of 15.67
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   10.00   14.00   15.67   20.00   81.00
sd(db.init$num_medications) # Standard deviation of 8.287
## [1] 8.287246
boxplot(db.init$num_medications, horizontal = T, col = '#83afe6', xlab = 'No. Medications') # Many visible outliers above upper fence

fences(db.init$num_medications) # Lower fence is -5 (below the min), and upper fence is 35
## [1] -5 35
length(subset(db.init$num_medications, db.init$num_medications > 35)) # 1867 records have a number of medications higher than the upper fence
## [1] 1867
hist.num_medications <- ggplot(db.init, aes(num_medications)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Medications', y = 'Count') + theme(axis.title.y = element_blank())
hist.num_medications # Distribution is unimodal and skewed slightly right

Number of Outpatient Visits

summary(db.init$number_outpatient) # Ranges from 0-42, with a median of 0, mean of 0.2795
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2795  0.0000 42.0000
sd(db.init$number_outpatient) # Standard deviation of 1.064
## [1] 1.064035
boxplot(db.init$number_outpatient, horizontal = T, col = '#83afe6', xlab = 'No. Outpatient Visits') # Many visible outliers above upper fence

fences(db.init$number_outpatient) # Lower fence is 0, and upper fence is 0
## [1] 0 0
length(subset(db.init$number_outpatient, db.init$number_outpatient != 0)) # 9119 records have a number of outpatient visits higher than the upper fence. I.e., had more than 0 outpatient visits in the last year
## [1] 9119
hist.number_outpatient <- ggplot(db.init, aes(number_outpatient)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Outpatient Visits', y = 'Count') + theme(axis.title.y = element_blank())
hist.number_outpatient # Most cases are zero

table(db.init$number_outpatient) # The vast majority of cases (60854) are zero, and the rest of the data is skewed right
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
## 60854  4779  1981  1096   574   278   122    72    56    36    28    18    13 
##    13    14    15    16    17    18    19    20    21    22    23    24    25 
##    13    10    10     7     4     2     1     3     1     2     1     2     1 
##    26    27    29    33    35    36    42 
##     1     2     1     2     1     1     1

Number of Emergency Visits

summary(db.init$number_emergency) # Ranges from 0-42, with a median of 0, mean of 0.1039
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1039  0.0000 42.0000
sd(db.init$number_emergency) # Standard deviation of 0.512
## [1] 0.5118705
boxplot(db.init$number_emergency, horizontal = T, col = '#83afe6', xlab = 'No. Emergency Visits') # Many visible outliers above upper fence

fences(db.init$number_emergency) # Lower fence is 0, and upper fence is 0
## [1] 0 0
length(subset(db.init$number_emergency, db.init$number_emergency != 0)) # 5100 records have a number of emergency visits higher than the upper fence. I.e., had more than 0 emergency visits in the last year
## [1] 5100
hist.number_emergency <- ggplot(db.init, aes(number_emergency)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Emergency Visits', y = 'Count') + theme(axis.title.y = element_blank())
hist.number_emergency # Most cases are zero

table(db.init$number_emergency) # The vast majority of cases (64873) are zero, and the rest of the data is skewed right
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    13 
## 64873  3882   789   242    95    32    26     8     9     4     5     2     1 
##    16    20    25    37    42 
##     1     1     1     1     1

Number of Inpatient Visits

summary(db.init$number_inpatient) # Ranges from 0-12, with a median of 0, mean of 0.1763
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1763  0.0000 12.0000
sd(db.init$number_inpatient) # Standard deviation of 0.602
## [1] 0.6016572
boxplot(db.init$number_inpatient, horizontal = T, col = '#83afe6', xlab = 'No. Inpatient Visits') # Visible outliers above upper fence

fences(db.init$number_inpatient) # Lower fence is 0, and upper fence is 0
## [1] 0 0
length(subset(db.init$number_inpatient, db.init$number_inpatient != 0)) # 8191 records have a number of inpatient visits higher than the upper fence. I.e., had more than 0 inpatient visits in the last year
## [1] 8191
hist.number_inpatient <- ggplot(db.init, aes(number_inpatient)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Inpatient Visits') + theme(axis.title.y = element_blank()) + theme(axis.title.y = element_blank())
hist.number_inpatient # Most casesa are zero

table(db.init$number_inpatient) # The vast majority of cases (61782) are zero, and the rest of the data is skewed right
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
## 61782  5794  1501   463   228   102    55    19    13     7     5     2     2

Number of Diagnoses

summary(db.init$number_diagnoses) # Ranges from 1-16, with a median of 8, mean of 7.224
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   6.000   8.000   7.224   9.000  16.000
sd(db.init$number_diagnoses) # Standard deviation of 2.001
## [1] 2.001354
boxplot(db.init$number_diagnoses, horizontal = T, col = '#83afe6', xlab = 'No. Diagnoses') # Visible outliers above upper fence and below lower fence

fences(db.init$number_diagnoses) # Lower fence is 1.5, and upper fence is 13.5
## [1]  1.5 13.5
length(subset(db.init$number_diagnoses, db.init$number_diagnoses < 1.5)) # 193 cases are below the lower fence. I.e., 193 people had 1 diagnosis, as 1 is the min
## [1] 193
length(subset(db.init$number_diagnoses, db.init$number_diagnoses > 13.5)) # 42 records have a number of diagnoses higher than the upper fence
## [1] 42
hist.number_diagnoses <- ggplot(db.init, aes(number_diagnoses)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Diagnoses', y = 'Count') + theme(axis.title.y = element_blank())
hist.number_diagnoses # Skewed left from 1-9

table(db.init$number_diagnoses) # From diagnoses 1-9, the data is skewed left. There are very few cases 10 and above. Since there is no biological reason for this, this finding suggests to me that there is some reason coding for more than 9 diagnoses may be difficult in the system, so many practitioners don't bother for an inpatient visit
## 
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
##   193   881  2357  4442  8881  7566  7485  7391 30704     9     6     6    10 
##    14    15    16 
##     5     7    30
length(subset(db.init$number_diagnoses, db.init$number_diagnoses > 9)) # 73 cases have more than 9 diagnoses
## [1] 73

The following are grouped visualizations of the numeric attributes above, specifically the histograms and boxplots.

grid.arrange(hist.time_in_hospital, hist.num_lab_procedures, hist.num_procedures, hist.num_medications, hist.number_outpatient, hist.number_emergency, hist.number_inpatient, hist.number_diagnoses, nrow = 4) # Histograms together

par(mfrow=c(4,2)) # Boxplots together
boxplot(db.init$time_in_hospital, xlab = 'Time in Hospital', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$num_lab_procedures, xlab = 'No. Lab Procedures', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$num_procedures, xlab = 'No. Procedures', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$num_medications, xlab = 'No. Medications', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$number_outpatient, xlab = 'No. Outpatient Visits', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$number_emergency, xlab = 'No. Emergency Visits', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$number_inpatient, xlab = 'No. Inpatient Visits', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$number_diagnoses, xlab = 'No. Diagnoses', las=1, horizontal = T, col = '#83afe6')

Now we’ll identify the overall number of outliers for the numeric attributes.

nrow(subset(db.init, db.init$time_in_hospital > 12 | db.init$num_lab_procedures > 96 | db.init$num_procedures > 5 | db.init$num_medications > 35 | db.init$number_outpatient != 0 | db.init$number_emergency != 0 | db.init$number_inpatient != 0 | db.init$number_diagnoses < 1.5 | db.init$number_diagnoses > 13.5)) # There are 23159 rows in which at least one outlier is present
## [1] 23159
23159/69973 # 33.1% of records in the dataset contain one or more outliers; these will be kept as they may all represent real cases based on their values. Additionally, neural networks are somewhat robust to outliers
## [1] 0.3309705

We will transfer the numeric attributes kept to a new clean dataframe (db.expl) for further exploratory analysis.

db.expl <- db.init[,c(10, 13:18, 22)]

Ordinal Attributes

We will go through each ordinal attribute and review frequencies, relative frequencies, missing values, and histograms as appropriate.

Age

table(db.init$age) # Most of cases appear to be grouped around ages 50-90 with the 70-80 group being the single largest (15684 cases); there are no missing values
## 
##   [0-10)  [10-20)  [20-30)  [30-40)  [40-50)  [50-60)  [60-70)  [70-80) 
##      153      534     1121     2692     6828    12349    15684    17750 
##  [80-90) [90-100) 
##    11102     1760
prop.table(table(db.init$age)) # Largest number of cases is 70-80 at 25.4%
## 
##      [0-10)     [10-20)     [20-30)     [30-40)     [40-50)     [50-60) 
## 0.002186558 0.007631515 0.016020465 0.038471982 0.097580495 0.176482357 
##     [60-70)     [70-80)     [80-90)    [90-100) 
## 0.224143598 0.253669272 0.158661198 0.025152559
ggplot(db.init, aes(age)) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Age', y = 'Count') # Distribution is unimodal and skewed slightly left

db.expl$age <- db.init$age # Add variable to exploration data frame; no changes needed

Weight

sort(prop.table(table(db.init$weight))) # 96.0% of the data is missing. Attribute will be removed for analysis and modeling
## 
##         >200    [175-200)    [150-175)       [0-25)      [25-50)    [125-150) 
## 4.287368e-05 1.286210e-04 4.573193e-04 6.573964e-04 1.171881e-03 1.843568e-03 
##    [100-125)      [50-75)     [75-100)            ? 
## 8.003087e-03 1.078988e-02 1.674932e-02 9.601561e-01

Max Glucose Serum

sort(prop.table(table(db.init$max_glu_serum))) # 95.2% of the time, the test was not taken. This attribute will be removed from analysis due to low variance 
## 
##       >300       >200       Norm       None 
## 0.01017535 0.01337659 0.02429509 0.95215297

Any attribute with one category spanning more than 95% of the data will be removed from analysis and modeling due to low variance.

A1C Result

sort(prop.table(table(db.init$A1Cresult))) # Test not done 81.6% of the time
## 
##         >7       Norm         >8       None 
## 0.04094436 0.05346348 0.08916296 0.81642919
db.expl$A1Cresult <- droplevels(fct_collapse(db.init$A1Cresult, '>7' = c('>7', '>8'))) # Put both abnormal results (>7, >8) in the same category, as they are small samples, and I'm interested in an abnormal VS normal result; there's no medical reason for them to be considered separately. These categories will be compared to the test not being done as its own category. Add updated column to db.expl dataframe for exploration
table(db.expl$A1Cresult)
## 
##    >7  None  Norm 
##  9104 57128  3741
sort(prop.table(table(db.expl$A1Cresult)))
## 
##       Norm         >7       None 
## 0.05346348 0.13010733 0.81642919

A1C Result will be considered a categorical variable going forward, comparing the categories of the test not being done (‘None’), the test done with a normal result (‘Norm’), and the test done with an abnormal result (‘>7’).

Categorical Attributes

We will go through each categorical attribute and review frequencies and relative frequencies. Any attribute with a small number of missing values will have the missing values imputed as the majority class. Categories will be condensed based on domain knowledge, and then any with under 5% of cases will be grouped into an ‘Other’ category. Attributes to be kept will be updated in the db.expl dataframe for further exploratory analysis.

Race

sort(prop.table(table(db.init$race))) # The highest occurring racial group was Caucasian and 74.7%, followed by African American at 18% and then Unknown ('?') at 2.7%
## 
##           Asian           Other        Hispanic               ? AfricanAmerican 
##     0.006974119     0.016434911     0.021436840     0.027410573     0.180426736 
##       Caucasian 
##     0.747316822
db.expl$race <- droplevels(fct_collapse(db.init$race, 'Caucasian' = c('Caucasian', '?'))) # Impute unknown ('?') category as the majority class (Caucasian); will drop any empty levels here too, and add updated column to db.expl dataframe for exploration
table(db.expl$race) # Review frequencies now that Unknown was imputed
## 
##       Caucasian AfricanAmerican           Asian        Hispanic           Other 
##           54210           12625             488            1500            1150
sort(prop.table(table(db.expl$race))) # Now Caucasian represents 77.5% of cases
## 
##           Asian           Other        Hispanic AfricanAmerican       Caucasian 
##     0.006974119     0.016434911     0.021436840     0.180426736     0.774727395

Gender

table(db.init$gender) # Only 3 Unknown/Invalid cases
## 
##          Female            Male Unknown/Invalid 
##           37229           32741               3
db.expl$gender <- droplevels(fct_collapse(db.init$gender, 'Female' = c('Female', 'Unknown/Invalid'))) # Impute Unknown/Invalid category as the majority class (Female)
table(db.expl$gender)
## 
## Female   Male 
##  37232  32741
prop.table(table(db.expl$gender)) #53.2% female, 46.8% male
## 
##    Female      Male 
## 0.5320909 0.4679091

Admission Type ID

sort(prop.table(table(db.init$admission_type_id))) # The majority came in through emergency (50.6%); Collectively 11.3% of cases are Not Mapped, Not Available or NULL. These will be assigned to the majority class (Emergency)
## 
##      Newborn TraumaCenter    NotMapped NotAvailable         NULL       Urgent 
## 0.0001286210 0.0002572421 0.0041587469 0.0441027253 0.0645391794 0.1829562831 
##     Elective    Emergency 
## 0.1970045589 0.5068526432
db.expl$admission_type_id <- droplevels(fct_collapse(db.init$admission_type_id, 'Emergency' = c('Emergency', 'NULL', 'NotAvailable', 'NotMapped'), 'Other' = c('TraumaCenter', 'Newborn'))) # Assign missing values to Emergency; add any categories under 5% of cases to "Other"
table(db.expl$admission_type_id)
## 
## Emergency    Urgent  Elective     Other 
##     43359     12802     13785        27
sort(prop.table(table(db.expl$admission_type_id)))
## 
##        Other       Urgent     Elective    Emergency 
## 0.0003858631 0.1829562831 0.1970045589 0.6196532948

Discharge Disposition ID

table(db.init$discharge_disposition_id) # There are 26 levels, most of which have very few cases
## 
##                        DcHome           DcAnotherSTHospital 
##                         44317                          1539 
##                         DcSNF                         DcICF 
##                          8784                           541 
##          DcOtherTypeInpatient         DcHomeWithHomeService 
##                           913                          8289 
##                       LeftAMA              DcHomeWithIVCare 
##                           409                            73 
## AdmittedInpatientThisHospital        NeonateDcOtherHospital 
##                             9                             6 
##                       Expired      StillPatientOrOutpatient 
##                             0                             2 
##                   HospiceHome               HospiceFacility 
##                             0                             0 
##              DcWithinSwingBed            RefOtherOutpatient 
##                            40                             3 
##             RefSameOutpatient                          NULL 
##                             8                          2474 
##                   ExpiredHome               ExpiredFacility 
##                             0                             0 
##                       DcRehab                 DcLTCHospital 
##                          1410                           260 
##     DcMedicaidNursingFacility                     NotMapped 
##                            25                           778 
##       DcFederalHealthFacility                 DcPsychiatric 
##                             3                            90
sort(prop.table(table(db.init$discharge_disposition_id))) #4.6% of cases unknown between Not Mapped and NULL; will assign to majority class (DcHome)
## 
##                       Expired                   HospiceHome 
##                  0.000000e+00                  0.000000e+00 
##               HospiceFacility                   ExpiredHome 
##                  0.000000e+00                  0.000000e+00 
##               ExpiredFacility      StillPatientOrOutpatient 
##                  0.000000e+00                  2.858245e-05 
##            RefOtherOutpatient       DcFederalHealthFacility 
##                  4.287368e-05                  4.287368e-05 
##        NeonateDcOtherHospital             RefSameOutpatient 
##                  8.574736e-05                  1.143298e-04 
## AdmittedInpatientThisHospital     DcMedicaidNursingFacility 
##                  1.286210e-04                  3.572807e-04 
##              DcWithinSwingBed              DcHomeWithIVCare 
##                  5.716491e-04                  1.043260e-03 
##                 DcPsychiatric                 DcLTCHospital 
##                  1.286210e-03                  3.715719e-03 
##                       LeftAMA                         DcICF 
##                  5.845112e-03                  7.731554e-03 
##                     NotMapped          DcOtherTypeInpatient 
##                  1.111857e-02                  1.304789e-02 
##                       DcRehab           DcAnotherSTHospital 
##                  2.015063e-02                  2.199420e-02 
##                          NULL         DcHomeWithHomeService 
##                  3.535649e-02                  1.184600e-01 
##                         DcSNF                        DcHome 
##                  1.255341e-01                  6.333443e-01
db.expl$discharge_disposition_id <- droplevels(fct_collapse(db.init$discharge_disposition_id, 
  'DcHome' = c('DcHome', 'NULL', 'NotMapped'),
  'DcHomeWithHomeService' = c('DcHomeWithHomeService', 'DcHomeWithIVCare'),
  'DcOtherFacility' = c('DcAnotherSTHospital', 'DcICF', 'DcOtherTypeInpatient', 'NeonateDcOtherHospital', 'DcRehab', 'DcLTCHospital', 'DcMedicaidNursingFacility', 'DcFederalHealthFacility', 'DcPsychiatric'),
  'Other' = c('AdmittedInpatientThisHospital', 'StillPatientOrOutpatient', 'DcWithinSwingBed', 'RefOtherOutpatient', 'RefSameOutpatient', 'LeftAMA'))) # Drop empty levels, and collapse categories based on what logically fits together; add any remaining categories under 5% of cases to "Other"
table(db.expl$discharge_disposition_id)
## 
##                DcHome       DcOtherFacility                 DcSNF 
##                 47569                  4787                  8784 
## DcHomeWithHomeService                 Other 
##                  8362                   471
sort(prop.table(table(db.expl$discharge_disposition_id))) #DcHome now has 68.0% of cases
## 
##                 Other       DcOtherFacility DcHomeWithHomeService 
##           0.006731168           0.068412102           0.119503237 
##                 DcSNF                DcHome 
##           0.125534135           0.679819359

Admission Source ID

table(db.init$admission_source_id) # There are 17 levels, most of which have very few cases
## 
##                            PhysRef                          ClinicRef 
##                              21746                                908 
##                             HMORef               TransferFromHospital 
##                                136                               2530 
##                    TransferFromSNF    TransferFromOtherHealthFacility 
##                                512                               1785 
##                                 ER                      CourtOrLawEnf 
##                              37260                                 11 
##                       NotAvailable TransferFromCriticalAccessHospital 
##                                 95                                  7 
##                     NormalDelivery                           SickBaby 
##                                  1                                  1 
##                    ExtramuralBirth                               NULL 
##                                  2                               4820 
##                          NotMapped                           SepClaim 
##                                153                                  4 
##                TransferFromAmbSurg 
##                                  2
sort(prop.table(table(db.init$admission_source_id))) #7.1% of cases unknown between Not Mapped, Not Available, and NULL; will assign to majority class (ER)
## 
##                     NormalDelivery                           SickBaby 
##                       1.429123e-05                       1.429123e-05 
##                    ExtramuralBirth                TransferFromAmbSurg 
##                       2.858245e-05                       2.858245e-05 
##                           SepClaim TransferFromCriticalAccessHospital 
##                       5.716491e-05                       1.000386e-04 
##                      CourtOrLawEnf                       NotAvailable 
##                       1.572035e-04                       1.357667e-03 
##                             HMORef                          NotMapped 
##                       1.943607e-03                       2.186558e-03 
##                    TransferFromSNF                          ClinicRef 
##                       7.317108e-03                       1.297643e-02 
##    TransferFromOtherHealthFacility               TransferFromHospital 
##                       2.550984e-02                       3.615680e-02 
##                               NULL                            PhysRef 
##                       6.888371e-02                       3.107770e-01 
##                                 ER 
##                       5.324911e-01
db.expl$admission_source_id <- droplevels(fct_collapse(db.init$admission_source_id, 
  'ER' = c('ER', 'NotMapped', 'NotAvailable', 'NULL'),
  'TransferExtFacility' = c('TransferFromHospital', 'TransferFromOtherHealthFacility', 'TransferFromSNF', 'ClinicRef', 'HMORef', 'TransferFromCriticalAccessHospital', 'TransferFromAmbSurg'),
  'Other' = c('CourtOrLawEnf', 'NormalDelivery', 'SickBaby', 'ExtramuralBirth', 'SepClaim'))) # Drop empty levels, and collapse categories based on what logically fits together; add any remaining categories under 5% of cases to "Other"
table(db.expl$admission_source_id)
## 
##             PhysRef TransferExtFacility                  ER               Other 
##               21746                5880               42328                  19
sort(prop.table(table(db.expl$admission_source_id)))
## 
##               Other TransferExtFacility             PhysRef                  ER 
##        0.0002715333        0.0840324125        0.3107770140        0.6049190402

Payer Code

levels(db.init$payer_code) # 18 Levels; unsure of what codes mean as most are not standardized and the original researchers gave no indication
##  [1] "?"  "BC" "CH" "CM" "CP" "DM" "FR" "HM" "MC" "MD" "MP" "OG" "OT" "PO" "SI"
## [16] "SP" "UN" "WC"
sort(prop.table(table(db.init$payer_code))) # Given the attribute is 43.5% missing, and we are unable to determine with any certainty the meanings behind categories, the attribute will not be used for analysis or modeling
## 
##           FR           MP           SI           OT           CH           WC 
## 1.429123e-05 4.573193e-04 5.287754e-04 8.860561e-04 1.614909e-03 1.672074e-03 
##           DM           PO           OG           CM           UN           CP 
## 5.316336e-03 6.531091e-03 9.246424e-03 1.850714e-02 2.651023e-02 2.771069e-02 
##           MD           SP           BC           HM           MC            ? 
## 3.094051e-02 4.720392e-02 4.854730e-02 5.693625e-02 2.827090e-01 4.346677e-01

Medical Specialty

levels(db.init$medical_specialty) # 73 categories
##  [1] "?"                                   
##  [2] "AllergyandImmunology"                
##  [3] "Anesthesiology"                      
##  [4] "Anesthesiology-Pediatric"            
##  [5] "Cardiology"                          
##  [6] "Cardiology-Pediatric"                
##  [7] "DCPTEAM"                             
##  [8] "Dentistry"                           
##  [9] "Dermatology"                         
## [10] "Emergency/Trauma"                    
## [11] "Endocrinology"                       
## [12] "Endocrinology-Metabolism"            
## [13] "Family/GeneralPractice"              
## [14] "Gastroenterology"                    
## [15] "Gynecology"                          
## [16] "Hematology"                          
## [17] "Hematology/Oncology"                 
## [18] "Hospitalist"                         
## [19] "InfectiousDiseases"                  
## [20] "InternalMedicine"                    
## [21] "Nephrology"                          
## [22] "Neurology"                           
## [23] "Neurophysiology"                     
## [24] "Obsterics&Gynecology-GynecologicOnco"
## [25] "Obstetrics"                          
## [26] "ObstetricsandGynecology"             
## [27] "Oncology"                            
## [28] "Ophthalmology"                       
## [29] "Orthopedics"                         
## [30] "Orthopedics-Reconstructive"          
## [31] "Osteopath"                           
## [32] "Otolaryngology"                      
## [33] "OutreachServices"                    
## [34] "Pathology"                           
## [35] "Pediatrics"                          
## [36] "Pediatrics-AllergyandImmunology"     
## [37] "Pediatrics-CriticalCare"             
## [38] "Pediatrics-EmergencyMedicine"        
## [39] "Pediatrics-Endocrinology"            
## [40] "Pediatrics-Hematology-Oncology"      
## [41] "Pediatrics-InfectiousDiseases"       
## [42] "Pediatrics-Neurology"                
## [43] "Pediatrics-Pulmonology"              
## [44] "Perinatology"                        
## [45] "PhysicalMedicineandRehabilitation"   
## [46] "PhysicianNotFound"                   
## [47] "Podiatry"                            
## [48] "Proctology"                          
## [49] "Psychiatry"                          
## [50] "Psychiatry-Addictive"                
## [51] "Psychiatry-Child/Adolescent"         
## [52] "Psychology"                          
## [53] "Pulmonology"                         
## [54] "Radiologist"                         
## [55] "Radiology"                           
## [56] "Resident"                            
## [57] "Rheumatology"                        
## [58] "Speech"                              
## [59] "SportsMedicine"                      
## [60] "Surgeon"                             
## [61] "Surgery-Cardiovascular"              
## [62] "Surgery-Cardiovascular/Thoracic"     
## [63] "Surgery-Colon&Rectal"                
## [64] "Surgery-General"                     
## [65] "Surgery-Maxillofacial"               
## [66] "Surgery-Neuro"                       
## [67] "Surgery-Pediatric"                   
## [68] "Surgery-Plastic"                     
## [69] "Surgery-PlasticwithinHeadandNeck"    
## [70] "Surgery-Thoracic"                    
## [71] "Surgery-Vascular"                    
## [72] "SurgicalSpecialty"                   
## [73] "Urology"
sort(prop.table(table(db.init$medical_specialty))) # 48.1% missing ('?'), and may be related to primary diagnosis (another attribute we have); attribute to be removed for analysis and modeling
## 
##      Pediatrics-AllergyandImmunology        Pediatrics-InfectiousDiseases 
##                         0.000000e+00                         0.000000e+00 
##                          Dermatology                      Neurophysiology 
##                         1.429123e-05                         1.429123e-05 
##                         Perinatology                           Proctology 
##                         1.429123e-05                         1.429123e-05 
##                 Psychiatry-Addictive                             Resident 
##                         1.429123e-05                         1.429123e-05 
##                               Speech                       SportsMedicine 
##                         1.429123e-05                         1.429123e-05 
##     Surgery-PlasticwithinHeadandNeck         Pediatrics-EmergencyMedicine 
##                         1.429123e-05                         4.287368e-05 
##       Pediatrics-Hematology-Oncology                              DCPTEAM 
##                         4.287368e-05                         5.716491e-05 
##                            Dentistry                 AllergyandImmunology 
##                         5.716491e-05                         8.574736e-05 
##                            Pathology               Pediatrics-Pulmonology 
##                         8.574736e-05                         8.574736e-05 
##          Psychiatry-Child/Adolescent                    Surgery-Pediatric 
##                         8.574736e-05                         8.574736e-05 
##                       Anesthesiology                 Cardiology-Pediatric 
##                         1.000386e-04                         1.000386e-04 
##             Endocrinology-Metabolism                 Pediatrics-Neurology 
##                         1.000386e-04                         1.000386e-04 
##                    PhysicianNotFound                Surgery-Maxillofacial 
##                         1.000386e-04                         1.143298e-04 
##                     OutreachServices                 Surgery-Colon&Rectal 
##                         1.286210e-04                         1.286210e-04 
##                         Rheumatology             Anesthesiology-Pediatric 
##                         1.429123e-04                         1.857859e-04 
##                           Obstetrics Obsterics&Gynecology-GynecologicOnco 
##                         2.429509e-04                         2.572421e-04 
##                    SurgicalSpecialty                   InfectiousDiseases 
##                         3.715719e-04                         4.144456e-04 
##                      Surgery-Plastic                           Hematology 
##                         4.144456e-04                         4.430280e-04 
##                        Ophthalmology                          Hospitalist 
##                         5.001929e-04                         5.144842e-04 
##                            Osteopath                            Radiology 
##                         5.287754e-04                         5.430666e-04 
##                              Surgeon                           Psychology 
##                         5.716491e-04                         7.574350e-04 
##                           Gynecology                             Podiatry 
##                         7.717262e-04                         9.003473e-04 
##              Pediatrics-CriticalCare               Surgery-Cardiovascular 
##                         1.043260e-03                         1.214754e-03 
##                     Surgery-Thoracic                        Endocrinology 
##                         1.300502e-03                         1.386249e-03 
##                  Hematology/Oncology                       Otolaryngology 
##                         1.557744e-03                         1.572035e-03 
##             Pediatrics-Endocrinology                            Neurology 
##                         2.100810e-03                         2.386635e-03 
##    PhysicalMedicineandRehabilitation                           Pediatrics 
##                         2.772498e-03                         2.786789e-03 
##                             Oncology                     Surgery-Vascular 
##                         2.929701e-03                         5.130550e-03 
##                     Gastroenterology                        Surgery-Neuro 
##                         5.473540e-03                         5.773656e-03 
##      Surgery-Cardiovascular/Thoracic                              Urology 
##                         6.974119e-03                         7.574350e-03 
##              ObstetricsandGynecology                           Psychiatry 
##                         8.474697e-03                         8.760522e-03 
##                          Pulmonology                           Nephrology 
##                         9.103511e-03                         1.139011e-02 
##                          Radiologist           Orthopedics-Reconstructive 
##                         1.173310e-02                         1.487717e-02 
##                          Orthopedics                      Surgery-General 
##                         1.612050e-02                         3.151215e-02 
##                           Cardiology                     Emergency/Trauma 
##                         6.012319e-02                         6.278136e-02 
##               Family/GeneralPractice                     InternalMedicine 
##                         7.114173e-02                         1.520729e-01 
##                                    ? 
##                         4.807426e-01

Diagnosis 1

length(levels(db.init$diag_1)) # 717 distinct primary diagnoses; a bit cumbersome to analyze
## [1] 717
sort(table(db.init$diag_1)) # Code 414 has the highest number of cases (5209), which is "Other forms of chronic ischemic heart disease". Next is code 428 at 3876 records, which is Heart Failure. 3rd highest is code 786 at 3040 records, which is "Symptoms involving respiratory system and other chest symptoms". Diabetes mellitus appears to have several thousand cases as well, but we can't easily tell as it's broken up into about 10 categories (250, 250.xx)
## 
##     23    271    279    372    373    389    506    523    543     58    698 
##      0      0      0      0      0      0      0      0      0      0      0 
##    731    804    827    895    919     97    973    974    988    V07    V66 
##      0      0      0      0      0      0      0      0      0      0      0 
##     10    114    131    133    143    145    148    160    207    216    217 
##      1      1      1      1      1      1      1      1      1      1      1 
##    219    229 250.51    262     27    299    314    318    325    334    347 
##      1      1      1      1      1      1      1      1      1      1      1 
##    363    365    366    375    381    391    412    448    471    477    500 
##      1      1      1      1      1      1      1      1      1      1      1 
##     52     57    580    605     61    615    634    637    640    649    653 
##      1      1      1      1      1      1      1      1      1      1      1 
##    669    671    674    684    690    691    700    703    704    720     75 
##      1      1      1      1      1      1      1      1      1      1      1 
##    791    803    817    826    832    833    834    837    838    839     84 
##      1      1      1      1      1      1      1      1      1      1      1 
##    842    848    870    878    885    903    904    906    911    915    917 
##      1      1      1      1      1      1      1      1      1      1      1 
##    923    939    944    955    957    963    971    975    976     98    980 
##      1      1      1      1      1      1      1      1      1      1      1 
##    982    994   E909    V25    V43    V51    V60    V67    V70    110    115 
##      1      1      1      1      1      1      1      1      1      2      2 
##    146    147    149    164    170    173    187    194    195    208    236 
##      2      2      2      2      2      2      2      2      2      2      2 
##    246 250.53 250.91    308     31    322    324    336    352     36    360 
##      2      2      2      2      2      2      2      2      2      2      2 
##    369    377    384    385     39    417    422     48    501    542    582 
##      2      2      2      2      2      2      2      2      2      2      2 
##    583    610    623    632    643    645    657    683    696      7    706 
##      2      2      2      2      2      2      2      2      2      2      2 
##    732    735    753    806    862    868    871    875    880    897    913 
##      2      2      2      2      2      2      2      2      2      2      2 
##    914    934    936    941    983    990    V26    172    179    192    240 
##      2      2      2      2      2      2      2      3      3      3      3 
##    245 250.52    261    266    356    370    374    382    405    483    508 
##      3      3      3      3      3      3      3      3      3      3      3 
##    524    529    602    603    619    665    709    795    797    846    866 
##      3      3      3      3      3      3      3      3      3      3      3 
##    879    916    921    928    942    986    987    136    142    175  250.5 
##      3      3      3      3      3      3      3      4      4      4      4 
##    272    301     34    341    379    388    397    452    470    474    495 
##      4      4      4      4      4      4      4      4      4      4      4 
##    570    598    633    647    734    745    747    792    793    796    800 
##      4      4      4      4      4      4      4      4      4      4      4 
##    831    836    863    886     94    141    201    215    228  250.9    273 
##      4      4      4      4      4      5      5      5      5      5      5 
##    342    344    361    395     41    445    463     49    551    622    655 
##      5      5      5      5      5      5      5      5      5      5      5 
##    685    692    705    717    751    759    814    815    835    854    881 
##      5      5      5      5      5      5      5      5      5      5      5 
##    890    893    968    991    V45    163    180    214    223    237    306 
##      5      5      5      5      5      6      6      6      6      6      6 
##    320    335    353    354    457    526    541    579    641     66    663 
##      6      6      6      6      6      6      6      6      6      6      6 
##    686    694    725     82    843    883    892    952    964    V63    117 
##      6      6      6      6      6      6      6      6      6      6      7 
##    210    212 250.21    337    359    362    383    464    588    708    746 
##      7      7      7      7      7      7      7      7      7      7      7 
##     78    816    845    847    865     88     11    158 250.31    350    454 
##      7      7      7      7      7      7      8      8      8      8      8 
##      5    521    695    737    864    891    970    992    V71    152    171 
##      8      8      8      8      8      8      8      8      8      9      9 
##    230    263    312     35    394    565    646    810    882    935    966 
##      9      9      9      9      9      9      9      9      9      9      9 
##    967      ?    161    184      3    323    338    376    513    652    736 
##      9     10     10     10     10     10     10     10     10     10     10 
##    861    867    933    977    989    199    200    244    297    368    462 
##     10     10     10     10     10     11     11     11     11     11     11 
##     54    840    V56    156 250.43 250.93    281    310    358    494    533 
##     11     11     11     12     12     12     12     12     12     12     12 
##    581    591    594    660    680    693    853    945 250.33    251    355 
##     12     12     12     12     12     12     12     12     13     13     13 
##    475    480    586    601    718  250.3 250.32    289    357    378    756 
##     13     13     13     13     13     14     14     14     14     14     14 
##    873    972    204    255    283    627    958    252    461    514    783 
##     14     14     15     15     15     15     15     16     16     16     16 
##    242    380    473    617    710    999    135    226    282    446    527 
##     17     17     17     17     17     17     18     18     18     18     18 
##    528    614    656    664    205 250.23    307    429    430    456    568 
##     18     18     18     18     19     19     19     19     19     19     19 
##    607    616    825    196    305    327    333    421    534    621    794 
##     19     19     19     20     20     20     20     20     20     20     20 
##    844    920    277    802    522    150    193    203    235    239    396 
##     20     20     21     21     22     23     23     23     23     23     23 
##     47    510    516    644    658    183    309    442    478    567    611 
##     23     23     23     23     23     24     24     24     24     24     24 
##    661    851    340    349    492    304    351    788    801    233    596 
##     24     24     25     25     25     26     26     26     26     27     27 
##    604    714    860     70     42    512    519    573    659      9    922 
##     27     27     27     28     29     29     29     29     29     31     31 
##    253    275    293    451    485    959    155    286    556    V53    191 
##     32     32     32     32     32     32     33     33     33     33     34 
##    465    284    332    V54    620    719    965 250.92    220    238    555 
##     34     35     35     35     36     36     37     38     39     39     39 
##    924    225    227    550    727    423    447    585    822    151    311 
##     39     41     41     41     41     42     42     42     42     43     43 
##    416    496     53    V55    716    962    112    298    303    969    288 
##     43     43     43     43     44     44     45     45     45     45     47 
##    738    420    799 250.01    241    432    608    995 250.42    642    850 
##     47     48     50     51     52     52     52     52     53     53     53 
##    711    300    443    481    438    625    287    290    459    807    425 
##     54     55     55     56     58     58     59     60     61     62     63 
##    723    782    490    537    346    398    595 250.41    593 250.83    785 
##     63     63     64     64     65     65     65     66     66     68     68 
##    813    455     79    345    437    729    274    291    576    626    292 
##     68     69     69     70     70     70     71     71     71     71     73 
##    202    566    726    154    157    294    852    781    790    188    681 
##     74     74     75     76     76     76     77     78     79     80     80 
##    487    654    515    536    348  250.2    211    728    189    808    413 
##     81     81     85     85     87     90     92     94     95     95     96 
##    441    564    182    823    444    431    784    553    572    V58    218 
##     98     99    100    103    107    108    108    109    109    115    116 
##    386 250.81 250.22    174    821    331    557    436    805    730    424 
##    116    117    118    123    124    133    134    137    138    140    141 
##    198    618    721    404    511    185    575    707    787    532    600 
##    148    149    149    152    153    155    155    156    162    165    165 
##    571    540 250.03    569    197    466    552    812  250.4    482    411 
##    166    170    173    177    179    182    183    188    190    193    194 
##    250    280    285    531  250.1    558    426    648    590    997    153 
##    200    207    211    212    218    230    241    242    249    249    250 
##    401    733    824    458 250.82    162    295      8    592 250.12    535 
##    250    250    254    260    264    265    275    275    284    293    296 
##    415    402    403    507    724    278    453    530    789    578 250.11 
##    304    306    310    314    339    357    360    364    365    410    427 
##    998 250.13  250.7 250.02    560    518    440    433  250.6    296    V57 
##    451    516    522    523    561    562    607    608    631    634    654 
##    722    493    577    562    435    574    820    584    599     38  250.8 
##    660    676    682    709    752    773    815    918    975    998   1074 
##    996    276    491    780    682    434    715    427    486    410    786 
##   1106   1180   1313   1409   1463   1514   1907   2019   2362   2774   3040 
##    428    414 
##   3876   5209
# We will create a new attribute with larger categories for the diagnoses so they are less cumbersome. The categories will be based on ICD-9-CM categories, with the exception of diabetes mellitus as its own category so that it may be analyzed. The unknown category ('?') will be kept for this step. Unfortunately, we have to use Regex to condense the categories as the attribute is currently a factor with some categories containing letters
db.expl$diag_1 <- rep('?', times = nrow(db.init))
db.expl$diag_1[grepl("^0|^[^EeVv?]$|^[^EeVv][0-9]$|^1[0123][0-9]$",db.init$diag_1)] <- "001-139"
db.expl$diag_1[grepl("^1[4-9][0-9]$|^2[0-3][0-9]$",db.init$diag_1)] <- "140-239"
db.expl$diag_1[grepl("^250.*",db.init$diag_1)] <- "250.0-250.9"
db.expl$diag_1[grepl("^2[467][0-9]$|^25[1-9]$",db.init$diag_1)] <- "240-279(not250)"
db.expl$diag_1[grepl("^28[0-9]$",db.init$diag_1)] <- "280-289"
db.expl$diag_1[grepl("^29[0-9]$|^3[01][0-9]$",db.init$diag_1)] <- "290-319"
db.expl$diag_1[grepl("^3[2-8][0-9]$",db.init$diag_1)] <- "320-389"
db.expl$diag_1[grepl("^39[0-9]$|^4[0-5][0-9]$",db.init$diag_1)] <- "390-459"
db.expl$diag_1[grepl("^4[6-9][0-9]$|^5[01][0-9]$",db.init$diag_1)] <- "460-519"
db.expl$diag_1[grepl("^5[2-7][0-9]$",db.init$diag_1)] <- "520-579"
db.expl$diag_1[grepl("^5[89][0-9]$|^6[0-2][0-9]$",db.init$diag_1)] <- "580-629"
db.expl$diag_1[grepl("^6[3-7][0-9]$",db.init$diag_1)] <- "630-679"
db.expl$diag_1[grepl("^6[89][0-9]$|^70[0-9]$",db.init$diag_1)] <- "680-709"
db.expl$diag_1[grepl("^7[1-3][0-9]$",db.init$diag_1)] <- "710-739"
db.expl$diag_1[grepl("^7[45][0-9]$",db.init$diag_1)] <- "740-759"
db.expl$diag_1[grepl("^7[67][0-9]$",db.init$diag_1)] <- "760-779"
db.expl$diag_1[grepl("^7[89][0-9]$",db.init$diag_1)] <- "780-799"
db.expl$diag_1[grepl("^[89][0-9]{2}$",db.init$diag_1)] <- "800-999"
db.expl$diag_1[grepl("^[Vv]",db.init$diag_1)] <- "V01-V91"
db.expl$diag_1[grepl("^[Ee]",db.init$diag_1)] <- "E000-E999"

db.expl$diag_1 <- as.factor(db.expl$diag_1) # Convert from character to factor
levels(db.expl$diag_1) # Now has 20 levels
##  [1] "?"               "001-139"         "140-239"         "240-279(not250)"
##  [5] "250.0-250.9"     "280-289"         "290-319"         "320-389"        
##  [9] "390-459"         "460-519"         "520-579"         "580-629"        
## [13] "630-679"         "680-709"         "710-739"         "740-759"        
## [17] "780-799"         "800-999"         "E000-E999"       "V01-V91"
sort(prop.table(table(db.expl$diag_1))) # Largest group of people had a primary diagnosis of a circulatory disorder (30.5%); 0.001% of cases are missing; will impute to majority class (390-459)
## 
##       E000-E999               ?         740-759         630-679         280-289 
##    1.429123e-05    1.429123e-04    5.859403e-04    8.374659e-03    9.303589e-03 
##         320-389         V01-V91         290-319         001-139         680-709 
##    1.226187e-02    1.311935e-02    2.207995e-02    2.408072e-02    2.543838e-02 
## 240-279(not250)         140-239         580-629         710-739         800-999 
##    2.643877e-02    3.627113e-02    4.879025e-02    5.807954e-02    6.708302e-02 
##         780-799     250.0-250.9         520-579         460-519         390-459 
##    7.864462e-02    8.214597e-02    9.039201e-02    9.212125e-02    3.046318e-01
ggplot(db.expl, aes(fct_infreq(diag_1))) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Primary Diagnosis', y = 'Count') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Visualize groups

# Want to reduce the levels further; will group anything under 5% into an "Other" category
diag1levels <- levels(db.expl$diag_1)[c(2:4,6:8,12:14,16,19,20)] 
db.expl$diag_1 <- droplevels(fct_collapse(db.expl$diag_1, '390-459' = c('390-459', '?'), 'Other' = diag1levels)) # Drop empty levels, and collapse categories - kept the categories over 5%; put rest in "Other"
levels(db.expl$diag_1) #Reduced to 8 levels to be more manageable for modeling
## [1] "390-459"     "Other"       "250.0-250.9" "460-519"     "520-579"    
## [6] "710-739"     "780-799"     "800-999"
table(db.expl$diag_1)
## 
##     390-459       Other 250.0-250.9     460-519     520-579     710-739 
##       21326       15867        5748        6446        6325        4064 
##     780-799     800-999 
##        5503        4694
sort(prop.table(table(db.expl$diag_1))) # Other category contains 22.7% of cases
## 
##     710-739     800-999     780-799 250.0-250.9     520-579     460-519 
##  0.05807954  0.06708302  0.07864462  0.08214597  0.09039201  0.09212125 
##       Other     390-459 
##  0.22675889  0.30477470

Diagnosis 2

length(levels(db.init$diag_2)) # 749 distinct secondary diagnoses
## [1] 749
sort(table(db.init$diag_2)) # Code 250 has the highest number of cases (4996); however, there are likely more, as 250 is split into around 10 categories. The second highest is 276 (4494 cases), which is "Disorders of fluid electrolyte and acid-base balance". Third highest is code 428 (4218 cases), which is Heart Failure.
## 
##    115    130    195 250.21  250.3 250.31    325    341    350    364     52 
##      0      0      0      0      0      0      0      0      0      0      0 
##    695    754    806    846    884    894    912     96    975    990   E813 
##      0      0      0      0      0      0      0      0      0      0      0 
##   E941   E945    V69    111    114    123    137    140    145    163    164 
##      0      0      0      1      1      1      1      1      1      1      1 
##    171    173    180    182    186    192    212    223    232    235    256 
##      1      1      1      1      1      1      1      1      1      1      1 
##    268     27    270    271    302     31    316    347     35    353    366 
##      1      1      1      1      1      1      1      1      1      1      1 
##    374    388     46    460    472    475    483      5    506    520    523 
##      1      1      1      1      1      1      1      1      1      1      1 
##    529    534    580    602    605    610    615    649    658     66    665 
##      1      1      1      1      1      1      1      1      1      1      1 
##    670    674    683    684    685    694      7    702    703    704    734 
##      1      1      1      1      1      1      1      1      1      1      1 
##    748     75    752    800    811    832    833    843    853    863    871 
##      1      1      1      1      1      1      1      1      1      1      1 
##    879     88    880    893    908    911    915    916    917    927    942 
##      1      1      1      1      1      1      1      1      1      1      1 
##    944    947    948    953    955    962    963    968    974    977    987 
##      1      1      1      1      1      1      1      1      1      1      1 
##     99    992    994   E817   E818   E821   E826   E829   E850   E854   E868 
##      1      1      1      1      1      1      1      1      1      1      1 
##   E882   E883   E890   E900   E915   E918   E919   E929   E938   E965   E968 
##      1      1      1      1      1      1      1      1      1      1      1 
##   E980    V03    V11    V13    V25    V50    V55    V60    V61    V66     11 
##      1      1      1      1      1      1      1      1      1      1      2 
##    141    179    215    217    226    228    240    246    259    269    318 
##      2      2      2      2      2      2      2      2      2      2      2 
##    352    360    376    377    379    381    383    405    464    474    477 
##      2      2      2      2      2      2      2      2      2      2      2 
##    484    501    543    622    634    641    645    654    661    663    686 
##      2      2      2      2      2      2      2      2      2      2      2 
##    742    751    755     78    795    797    801    814    822    831    837 
##      2      2      2      2      2      2      2      2      2      2      2 
##    862    866    869    872    906    918    933    945    952    967    972 
##      2      2      2      2      2      2      2      2      2      2      2 
##    980    991   E814   E819   E853   E887   E916   E924   E931    V16    V53 
##      2      2      2      2      2      2      2      2      2      2      2 
##    V86    131    152    225    239 250.23 250.32    297    306    308    320 
##      2      3      3      3      3      3      3      3      3      3      3 
##    351    355    373    395    422    448    463    470    495    500    508 
##      3      3      3      3      3      3      3      3      3      3      3 
##    513    522    527    542    588    603    619    656    659    664    691 
##      3      3      3      3      3      3      3      3      3      3      3 
##    692    725    747    815    826    842    870    913    919    921    922 
##      3      3      3      3      3      3      3      3      3      3      3 
##    965   E816   E858   E870   E905   E906   E930   E937    V09    V14    V23 
##      3      3      3      3      3      3      3      3      3      3      3 
##    193 250.33    258    310    323    324    369    454    457    480    579 
##      4      4      4      4      4      4      4      4      4      4      4 
##    623    647    652    705    712    713    758    810    844    851    892 
##      4      4      4      4      4      4      4      4      4      4      4 
##    907    909    989   E881   E927   E936    V02    172    227    245    317 
##      4      4      4      4      4      4      4      5      5      5      5 
##    335    336    343    372    430    452    521    594    598    644    698 
##      5      5      5      5      5      5      5      5      5      5      5 
##    741    759    793    865    868    934    969   E917   E933    110    136 
##      5      5      5      5      5      5      5      5      5      6      6 
##    191    208    214 250.91    273    322    380    382    389    494    510 
##      6      6      6      6      6      6      6      6      6      6      6 
##    514    528    565    604    709    750    791    847    864    867    891 
##      6      6      6      6      6      6      6      6      6      6      6 
##      9    910    V08    V44    183    188    199    220    241  250.9    260 
##      6      6      6      6      7      7      7      7      7      7      7 
##    262    299     40    421    524     54    540    586    646    701    792 
##      7      7      7      7      7      7      7      7      7      7      7 
##    796    836    861    882   E812    V70    117    156    251    266     34 
##      7      7      7      7      7      7      8      8      8      8      8 
##    365    485    517    533    570    608    621    845    883    905    923 
##      8      8      8      8      8      8      8      8      8      8      8 
##   E934   E950    V18  250.2    261    279    333    338    354    461    627 
##      8      8      8      9      9      9      9      9      9      9      9 
##    706    737    825    V17    V46    V57    200    281    307    356    423 
##      9      9      9      9      9      9     10     10     10     10     10 
##    693    696    717    816    881    959   E939   E947    233    314    319 
##     10     10     10     10     10     10     10     10     11     11     11 
##    431    442    550    745    756    823   E928   E944  250.1    359    378 
##     11     11     11     11     11     11     11     11     12     12     12 
##    487    555    718    958    201    840    852   E879    V72    151 250.93 
##     12     12     12     12     13     13     13     13     13     14     14 
##    283    556    607    614    753    824    V49    185    312    346    394 
##     14     14     14     14     14     14     14     15     15     15     15 
##    432    436    462    583    783   E880    238    566    736    746    860 
##     15     15     15     15     15     15     16     16     16     16     16 
##     94    138 250.53    298    516    802    821    155    174    252    274 
##     16     17     17     17     17     17     17     18     18     18     18 
##    349    420    611    617    680    716   E884    V64    358    386    447 
##     18     18     18     18     18     18     18     18     19     19     19 
##    519    601    616    924    368    532    537    738    813    920   E935 
##     19     19     19     19     20     20     20     20     20     20     20 
##    V65    253    150 250.43    446    557    362    455    478    626    726 
##     20     21     22     22     22     22     23     23     23     23     23 
##    327    345    642    727    429    850    999   E942    332     53     79 
##     24     24     24     24     25     25     25     25     26     26     26 
##    V63 250.83    473    189 250.22    289    451    620    812    154    275 
##     26     27     27     28     28     28     28     28     28     29     29 
##    398    820    290    441    481    711    714    808    282    337    531 
##     29     29     30     30     30     31     31     31     32     32     32 
##    600   E932    618    205    681    V54    873    459    573    512    552 
##     32     32     33     34     34     34     35     36     37     38     38 
##    277    309    211    465    572    301    490    625    807    V12    456 
##     39     39     40     40     40     41     41     41     41     41     42 
##    596    157    218 250.52    V62    344    721    255    203    723   E849 
##     42     43     43     43     43     44     45     47     48     48     48 
##    794    443    595    291    581 250.51 250.11  250.5 250.81    437    568 
##     49     51     51     52     52     53     54     54     54     54     54 
##    397     42    731    357    564    782    293    311    575    288    576 
##     55     55     55     56     56     56     57     57     57     58     59 
##    592    242 250.42    553    567    805    482 250.12 250.13    416    435 
##     59     60     60     60     60     60     62     64     65     65     65 
##    292    719    153    415    444    340    558    135  250.7    204    729 
##     68     68     71     71     72     75     75     76     76     77     77 
##    995    286    348    466    V15   E878    331    412   E888    V10    593 
##     77     79     79     79     79     80     81     84     84     85     87 
##    V43    728    536   E885    284    304    507    715    781    562    590 
##     88     90     93     93     94     95     95     95    101    103    103 
##    202  250.4    722    515    263    296    458    724    V58 250.82    300 
##    104    106    106    108    109    109    109    109    109    110    110 
##    434      8 250.41    433    244    294    112    438    733    404    V85 
##    112    113    117    117    118    118    121    123    124    125    127 
##    569    710    396  250.8    535    790    730    784 250.92    787    492 
##    128    130    132    138    141    141    142    143    148    150    154 
## 250.03    V42    196    342    591    799    303     70    198    789    530 
##    164    173    174    183    184    190    192    192    206    209    212 
##    287    278    453    162    578    571    197     38    426    577    402 
##    220    222    224    226    226    232    236    243    246    250    251 
##    440    V45    560    648     41    788    295      ?    574    785    996 
##    260    265    273    283    286    286    291    293    297    297    304 
##    997    511    272    410    998    280    786    305  250.6    493    518 
##    304    314    348    363    429    432    480    504    584    627    765 
##    413    424    486    425    491    585    682    584    780    285 250.01 
##    810    813    830    868    881    908    927   1004   1022   1108   1120 
##    707 250.02    403    414    411    496    599    401    427    428    276 
##   1183   1542   1576   1977   2034   2188   2210   3077   3445   4218   4494 
##    250 
##   4996
# Same as for diagnosis 1, we will create a new attribute with larger categories for the diagnoses.
db.expl$diag_2 <- rep('?', times = nrow(db.init))
db.expl$diag_2[grepl("^0|^[^EeVv?]$|^[^EeVv][0-9]$|^1[0123][0-9]$",db.init$diag_2)] <- "001-139"
db.expl$diag_2[grepl("^1[4-9][0-9]$|^2[0-3][0-9]$",db.init$diag_2)] <- "140-239"
db.expl$diag_2[grepl("^250.*",db.init$diag_2)] <- "250.0-250.9"
db.expl$diag_2[grepl("^2[467][0-9]$|^25[1-9]$",db.init$diag_2)] <- "240-279(not250)"
db.expl$diag_2[grepl("^28[0-9]$",db.init$diag_2)] <- "280-289"
db.expl$diag_2[grepl("^29[0-9]$|^3[01][0-9]$",db.init$diag_2)] <- "290-319"
db.expl$diag_2[grepl("^3[2-8][0-9]$",db.init$diag_2)] <- "320-389"
db.expl$diag_2[grepl("^39[0-9]$|^4[0-5][0-9]$",db.init$diag_2)] <- "390-459"
db.expl$diag_2[grepl("^4[6-9][0-9]$|^5[01][0-9]$",db.init$diag_2)] <- "460-519"
db.expl$diag_2[grepl("^5[2-7][0-9]$",db.init$diag_2)] <- "520-579"
db.expl$diag_2[grepl("^5[89][0-9]$|^6[0-2][0-9]$",db.init$diag_2)] <- "580-629"
db.expl$diag_2[grepl("^6[3-7][0-9]$",db.init$diag_2)] <- "630-679"
db.expl$diag_2[grepl("^6[89][0-9]$|^70[0-9]$",db.init$diag_2)] <- "680-709"
db.expl$diag_2[grepl("^7[1-3][0-9]$",db.init$diag_2)] <- "710-739"
db.expl$diag_2[grepl("^7[45][0-9]$",db.init$diag_2)] <- "740-759"
db.expl$diag_2[grepl("^7[67][0-9]$",db.init$diag_2)] <- "760-779"
db.expl$diag_2[grepl("^7[89][0-9]$",db.init$diag_2)] <- "780-799"
db.expl$diag_2[grepl("^[89][0-9]{2}$",db.init$diag_2)] <- "800-999"
db.expl$diag_2[grepl("^[Vv]",db.init$diag_2)] <- "V01-V91"
db.expl$diag_2[grepl("^[Ee]",db.init$diag_2)] <- "E000-E999"

db.expl$diag_2 <- as.factor(db.expl$diag_2) # Convert from character to factor
levels(db.expl$diag_2) # Now has 20 levels
##  [1] "?"               "001-139"         "140-239"         "240-279(not250)"
##  [5] "250.0-250.9"     "280-289"         "290-319"         "320-389"        
##  [9] "390-459"         "460-519"         "520-579"         "580-629"        
## [13] "630-679"         "680-709"         "710-739"         "740-759"        
## [17] "780-799"         "800-999"         "E000-E999"       "V01-V91"
sort(prop.table(table(db.expl$diag_2))) # Largest group of people had a secondary diagnosis of a circulatory disorder (31.1%); 0.4% missing, will impute to majority class (390-459)
## 
##         740-759               ?         630-679       E000-E999         320-389 
##     0.001186172     0.004187329     0.005044803     0.008145999     0.012776357 
##         V01-V91         001-139         710-739         140-239         800-999 
##     0.017392423     0.017721121     0.018507138     0.022851671     0.026067197 
##         290-319         280-289         680-709         520-579         780-799 
##     0.026524517     0.029654295     0.031840853     0.038643477     0.045274606 
##         580-629 240-279(not250)         460-519     250.0-250.9         390-459 
##     0.072056365     0.080102325     0.092106956     0.138624898     0.311291498
ggplot(db.expl, aes(fct_infreq(diag_2))) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Secondary Diagnosis', y = 'Count') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Visualize groups

# Want to reduce the levels further; will group anything under 5% into an "Other" category
diag2levels <- levels(db.expl$diag_2)[c(2,3,6:8,11,13:20)] 
db.expl$diag_2 <- droplevels(fct_collapse(db.expl$diag_2, '390-459' = c('390-459', '?'), Other = diag2levels)) # Drop empty levels, and collapse categories - kept the categories over 5%; put rest in "Other"
levels(db.expl$diag_2) #Reduced to 6 levels
## [1] "390-459"         "Other"           "240-279(not250)" "250.0-250.9"    
## [5] "460-519"         "580-629"
table(db.expl$diag_2)
## 
##         390-459           Other 240-279(not250)     250.0-250.9         460-519 
##           22075           21106            5605            9700            6445 
##         580-629 
##            5042
sort(prop.table(table(db.expl$diag_2))) # Other category contains 30.2% of cases
## 
##         580-629 240-279(not250)         460-519     250.0-250.9           Other 
##      0.07205636      0.08010233      0.09210696      0.13862490      0.30163063 
##         390-459 
##      0.31547883

Diagnosis 3

length(levels(db.init$diag_3)) # 790 distinct tertiary diagnoses
## [1] 790
sort(table(db.init$diag_3)) # Code 250 has the highest number of cases (8980); however, there are likely more, as 250 is split into around 10 categories. The second highest is 401 (6549 cases), which is "Essential hypertension". Third highest is code 276 (3302 cases), which is "Disorders of fluid electrolyte and acid-base balance"
## 
##     11    111    132    146    175    186    215    230    236      3    448 
##      0      0      0      0      0      0      0      0      0      0      0 
##    475    483    485    538    671    732    848    863    870     88    884 
##      0      0      0      0      0      0      0      0      0      0      0 
##    935    942    944    980    992   E864   E876   E915   E955   E987    V22 
##      0      0      0      0      0      0      0      0      0      0      0 
##    115    122    123    139     14    141    148    152    158    161    164 
##      1      1      1      1      1      1      1      1      1      1      1 
##     17    171    193    195    217    223    226    243 250.21  250.3 250.31 
##      1      1      1      1      1      1      1      1      1      1      1 
##    258    265     27    271    308    314    315    341    361 365.44    370 
##      1      1      1      1      1      1      1      1      1      1      1 
##    381    385    387    391    395    430    460    463     47    480    484 
##      1      1      1      1      1      1      1      1      1      1      1 
##     49    524    540    542     57    602    603    622    643    657     66 
##      1      1      1      1      1      1      1      1      1      1      1 
##    669    674    684    690    697    744     75    750    754    755    757 
##      1      1      1      1      1      1      1      1      1      1      1 
##    759    800    801    811    822    834    837    838    841    844    853 
##      1      1      1      1      1      1      1      1      1      1      1 
##    854    864    871    872    875    876    877    879    880    890    893 
##      1      1      1      1      1      1      1      1      1      1      1 
##    928    930    943    951    952    953    955    967    970    971    972 
##      1      1      1      1      1      1      1      1      1      1      1 
##    987    989   E815   E822   E825   E826   E828   E852   E853   E854   E855 
##      1      1      1      1      1      1      1      1      1      1      1 
##   E861   E865   E882   E883   E886   E892   E894   E900   E901   E904   E912 
##      1      1      1      1      1      1      1      1      1      1      1 
##   E922   E943   E945   E949   E965   E966    V01    V06    V07    136    173 
##      1      1      1      1      1      1      1      1      1      2      2 
##    180    192    216    220 250.23    262    268    313    323    350    353 
##      2      2      2      2      2      2      2      2      2      2      2 
##    360    374    376    377    384    421    445    500    508    523    529 
##      2      2      2      2      2      2      2      2      2      2      2 
##    534    550    597    624    641    670      7    702    706    734    735 
##      2      2      2      2      2      2      2      2      2      2      2 
##    795    797    814    816    851    852    862    866    868    911    912 
##      2      2      2      2      2      2      2      2      2      2      2 
##    918    919    948    956    966   E813   E817   E919   E920   E924   E946 
##      2      2      2      2      2      2      2      2      2      2      2 
##   E980    V02    V11    V25    V53    V70    156    163    172    179    182 
##      2      2      2      2      2      2      3      3      3      3      3 
##    214    245    259    299    334    335    336    347    373    379    417 
##      3      3      3      3      3      3      3      3      3      3      3 
##    452    470    495      5    506    525    565    619    655    661    685 
##      3      3      3      3      3      3      3      3      3      3      3 
##    703    720     78    826    842    865    910    913    915    921    933 
##      3      3      3      3      3      3      3      3      3      3      3 
##   E858   E887   E905   E916   E917   E937   E956    V03    V60    V86    155 
##      3      3      3      3      3      3      3      3      3      3      4 
##    170    191    233    235    239    240 250.91    260    270     35    359 
##      4      4      4      4      4      4      4      4      4      4      4 
##    472    481    510    528    543    566    579    604    653    665    742 
##      4      4      4      4      4      4      4      4      4      4      4 
##    758    831    847    850    860    891      9    906    908    909    934 
##      4      4      4      4      4      4      4      4      4      4      4 
##   E818   E850   E906   E936   E941    V23    V61    200    246    256    306 
##      4      4      4      4      4      4      4      5      5      5      5 
##     34    366    372    383    388    420    436    454    464    516    527 
##      5      5      5      5      5      5      5      5      5      5      5 
##    623    644    658    704    712    741    751    752    815    945    962 
##      5      5      5      5      5      5      5      5      5      5      5 
##    965    991   E927   E938    V13    V66    117    131    151    225    227 
##      5      5      5      5      5      5      6      6      6      6      6 
##    228  250.2 250.22    297    380    431    432    557    570    605    660 
##      6      6      6      6      6      6      6      6      6      6      6 
##    701    705    756    810    821    845    867    881    882    917   E870 
##      6      6      6      6      6      6      6      6      6      6      6 
##   E929    V55    V57    150    183    201    251    279    354    451    582 
##      6      6      6      7      7      7      7      7      7      7      7 
##    594    610    652    686    708    713    717    746    796    825    836 
##      7      7      7      7      7      7      7      7      7      7      7 
##    892    V16    208    273    283    310    312    405    494    501    514 
##      7      7      8      8      8      8      8      8      8      8      8 
##    649    680    694    695    718    747    793    823    861    883    958 
##      8      8      8      8      8      8      8      8      8      8      8 
##    969   E819   E881   E928    V72    241 250.13  250.9 250.93    307    318 
##      8      8      8      8      8      9      9      9      9      9      9 
##    389    580    598    607    698    753    907   E816    261    351    382 
##      9      9      9      9      9      9      9      9     10     10     10 
##    522    586    608    647    656    791    820    V63    154    266    317 
##     10     10     10     10     10     10     10     10     11     11     11 
##    378    394    487    521     54    611    621    692    709    725    808 
##     11     11     11     11     11     11     11     11     11     11     11 
##    840    916   E931    V14    188 250.11    343    556    654    693    792 
##     11     11     11     11     12     12     12     12     12     12     12 
##    959    138    205    349    355     42     53    552    646    663    922 
##     12     13     13     13     13     13     13     13     13     13     13 
##    923   E933   E934    V08    110    358    442    462    533    537    664 
##     13     13     13     13     14     14     14     14     14     14     14 
##    711    905   E930   E939   E950    365    398    477    627    745   E944 
##     14     14     14     14     14     15     15     15     15     15     15 
##    V18    157  250.1 250.83    356    386    461    588    601    737    802 
##     15     16     16     16     16     16     16     16     16     16     16 
##   E884    369    457    532    738    813     94    189    252    555    567 
##     16     17     17     17     17     17     17     18     18     18     18 
##   E812    281    736    824    920    999    V65    346    415    423    723 
##     18     19     19     19     19     19     19     20     20     20     20 
##    V44    V62    446    696    805    V17    199 250.53    319    512    620 
##     20     20     21     21     21     21     22     22     22     22     22 
##    812   E880    298    345    614   E935    333    338    456    617    626 
##     22     22     23     23     23     23     24     24     24     24     24 
##    807   E947    447    517    519    572    616    659    727    238 250.81 
##     24     24     25     25     25     25     25     25     25     26     26 
##    277    289 250.12    282    478    618    924    V49    291    482    642 
##     26     26     27     28     28     28     28     28     29     29     29 
##    153    174    368    726    590 250.43    309    V27    253    434    444 
##     30     30     30     30     31     32     32     32     33     34     34 
##    575    576     79   E942    V46    V64    595   E879    441    573    873 
##     34     34     34     34     34     34     35     35     36     36     36 
##    203    185    292    783    340    344    490    507    V54    435    721 
##     37     38     38     38     39     39     39     39     39     40     40 
##    531    348    473    255    581    568    135    429    625    204    218 
##     42     43     43     44     44     45     46     46     46     48     48 
##    274    327    592   E932    465    293    242    362    600    583    301 
##     49     50     50     50     51     54     55     55     55     57     58 
##    337    455    681    275 250.52    290    596 250.82    332    437    V09 
##     58     58     58     59     60     60     62     63     63     63     64 
##    714    719 250.42  250.5    716    202    196    286    V85    728    459 
##     65     65     67     68     68     69     70     70     71     72     73 
##    288    304    569     38    710    211    404    558 250.51    731    284 
##     77     77     77     78     78     79     79     79     80     80     81 
##    794  250.7    782    466   E885    577    722    648 250.03    591    162 
##     84     85     85     87     90     91     92     94     95     95     97 
## 250.41    515    564    781   E888    724 250.92    574    416    553   E878 
##     97     97     99     99     99    101    104    106    107    107    107 
##    729    397      8    433    331    453    730    342    536    410    263 
##    110    112    112    113    119    123    124    125    126    128    129 
##    198    296    440    443    492    733    303    790    V43    V12    562 
##    130    132    135    135    135    135    138    139    139    140    143 
##    112    311    715    V42    578    300     70    784    438    996    396 
##    148    151    152    152    159    164    169    169    173    174    179 
##    789    995   E849    V10    571    458    799    295    426    535    294 
##    181    182    182    182    183    189    192    194    194    198    204 
##  250.8    511    560    197    413    998    V15    785    997  250.4    593 
##    209    214    216    229    236    237    239    241    241    247    251 
##    412    787    788    402    287    357    280    411    486    491    V58 
##    252    253    262    274    284    289    290    307    329    346    348 
##    786    244    530    493     41    518    682    278    584 250.01  250.6 
##    405    411    469    504    514    519    539    559    591    638    660 
##    305    425    707    424    285    V45 250.02    780    585      ?    599 
##    704    707    766    778    840    888    894    901    920   1224   1240 
##    403    496    272    427    414    428    276    401    250 
##   1258   1594   1630   2590   2646   2753   3302   6549   8980
# Same as for diagnosis 1 and 2, we will create a new attribute with larger categories for the diagnoses.
db.expl$diag_3 <- rep('?', times = nrow(db.init))
db.expl$diag_3[grepl("^0|^[^EeVv?]$|^[^EeVv][0-9]$|^1[0123][0-9]$",db.init$diag_3)] <- "001-139"
db.expl$diag_3[grepl("^1[4-9][0-9]$|^2[0-3][0-9]$",db.init$diag_3)] <- "140-239"
db.expl$diag_3[grepl("^250.*",db.init$diag_3)] <- "250.0-250.9"
db.expl$diag_3[grepl("^2[467][0-9]$|^25[1-9]$",db.init$diag_3)] <- "240-279(not250)"
db.expl$diag_3[grepl("^28[0-9]$",db.init$diag_3)] <- "280-289"
db.expl$diag_3[grepl("^29[0-9]$|^3[01][0-9]$",db.init$diag_3)] <- "290-319"
db.expl$diag_3[grepl("^3[2-8][0-9]$",db.init$diag_3)] <- "320-389"
db.expl$diag_3[grepl("^39[0-9]$|^4[0-5][0-9]$",db.init$diag_3)] <- "390-459"
db.expl$diag_3[grepl("^4[6-9][0-9]$|^5[01][0-9]$",db.init$diag_3)] <- "460-519"
db.expl$diag_3[grepl("^5[2-7][0-9]$",db.init$diag_3)] <- "520-579"
db.expl$diag_3[grepl("^5[89][0-9]$|^6[0-2][0-9]$",db.init$diag_3)] <- "580-629"
db.expl$diag_3[grepl("^6[3-7][0-9]$",db.init$diag_3)] <- "630-679"
db.expl$diag_3[grepl("^6[89][0-9]$|^70[0-9]$",db.init$diag_3)] <- "680-709"
db.expl$diag_3[grepl("^7[1-3][0-9]$",db.init$diag_3)] <- "710-739"
db.expl$diag_3[grepl("^7[45][0-9]$",db.init$diag_3)] <- "740-759"
db.expl$diag_3[grepl("^7[67][0-9]$",db.init$diag_3)] <- "760-779"
db.expl$diag_3[grepl("^7[89][0-9]$",db.init$diag_3)] <- "780-799"
db.expl$diag_3[grepl("^[89][0-9]{2}$",db.init$diag_3)] <- "800-999"
db.expl$diag_3[grepl("^[Vv]",db.init$diag_3)] <- "V01-V91"
db.expl$diag_3[grepl("^[Ee]",db.init$diag_3)] <- "E000-E999"

db.expl$diag_3 <- as.factor(db.expl$diag_3) # Convert from character to factor
levels(db.expl$diag_3) # Now has 20 levels
##  [1] "?"               "001-139"         "140-239"         "240-279(not250)"
##  [5] "250.0-250.9"     "280-289"         "290-319"         "320-389"        
##  [9] "390-459"         "460-519"         "520-579"         "580-629"        
## [13] "630-679"         "680-709"         "710-739"         "740-759"        
## [17] "780-799"         "800-999"         "E000-E999"       "V01-V91"
sort(prop.table(table(db.expl$diag_3))) # Largest group of people had a tertiary diagnosis of a circulatory disorder (29.5%); 1.8% missing values, will be imputed to majority class (390-459)
## 
##         740-759         630-679       E000-E999         140-239               ? 
##     0.001057551     0.003901505     0.013290841     0.016363454     0.017506753 
##         001-139         320-389         710-739         800-999         680-709 
##     0.017563918     0.017635374     0.019550398     0.020136338     0.021365384 
##         280-289         290-319         520-579         V01-V91         780-799 
##     0.024623783     0.030640390     0.034956340     0.036957112     0.044159890 
##         580-629         460-519 240-279(not250)     250.0-250.9         390-459 
##     0.054092293     0.060666257     0.091506724     0.179297729     0.294727967
ggplot(db.expl, aes(fct_infreq(diag_3))) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Tertiary Diagnosis', y = 'Count') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Visualize groups

# Want to reduce the levels further; will group anything under 5% into an "Other" category
diag3levels <- levels(db.expl$diag_3)[c(2,3,6:8,11,13:20)] 
db.expl$diag_3 <- droplevels(fct_collapse(db.expl$diag_3, '390-459' = c('390-459', '?'), 'Other' = diag3levels)) # Drop empty levels, and collapse categories - kept the categories over 5%; put rest in "Other"
levels(db.expl$diag_3) #Reduced to 6 levels
## [1] "390-459"         "Other"           "240-279(not250)" "250.0-250.9"    
## [5] "460-519"         "580-629"
table(db.expl$diag_3)
## 
##         390-459           Other 240-279(not250)     250.0-250.9         460-519 
##           21848           21146            6403           12546            4245 
##         580-629 
##            3785
sort(prop.table(table(db.expl$diag_3))) # Other category contains 30.2% of cases
## 
##         580-629         460-519 240-279(not250)     250.0-250.9           Other 
##      0.05409229      0.06066626      0.09150672      0.17929773      0.30220228 
##         390-459 
##      0.31223472

Individual Medication Attributes

for(i in 25:47){
  x <- colnames(db.init[i])
  y <- sort(table(db.init[i]))
  z <- sort(prop.table(table(db.init[i])))
  print(c(x, y, z))
} # Review the name, frequency, and relative frequency for each individual medication. Each attribute has four levels: Down, Up, Steady, and No
##                                      Down                   Up 
##          "metformin"                "435"                "834" 
##               Steady                   No                 Down 
##              "13634"              "55070" "0.0062166835779515" 
##                   Up               Steady                   No 
## "0.0119188829977277"  "0.194846583682277"  "0.787017849742043" 
##                                          Down                     Up 
##          "repaglinide"                   "28"                   "71" 
##                 Steady                     No                   Down 
##                  "818"                "69056" "0.000400154345247453" 
##                     Up                 Steady                     No 
##  "0.00101467708973461"    "0.011690223371872"    "0.986894945193146" 
##                                          Down                     Up 
##          "nateglinide"                    "8"                   "16" 
##                 Steady                     No                   Down 
##                  "467"                "69482" "0.000114329812927844" 
##                     Up                 Steady                     No 
## "0.000228659625855687"  "0.00667400282966287"    "0.992983007731554" 
##                                          Down                     Up 
##       "chlorpropamide"                    "1"                    "4" 
##                 Steady                     No                   Down 
##                   "66"                "69902" "1.42912266159804e-05" 
##                     Up                 Steady                     No 
## "5.71649064639218e-05"  "0.00094322095665471"    "0.998985322910265" 
##                                        Down                    Up 
##         "glimepiride"                 "136"                 "230" 
##                Steady                    No                  Down 
##                "3331"               "66276" "0.00194360681977334" 
##                    Up                Steady                    No 
##  "0.0032869821216755"  "0.0476040758578309"    "0.94716533520072" 
##                                        Steady                     No 
##        "acetohexamide"                    "1"                "69972" 
##                 Steady                     No 
## "1.42912266159804e-05"    "0.999985708773384" 
##                                        Down                    Up 
##           "glipizide"                 "371"                 "573" 
##                Steady                    No                  Down 
##                "8063"               "60966" "0.00530204507452875" 
##                    Up                Steady                    No 
##  "0.0081888728509568"    "0.11523016020465"   "0.871278921869864" 
##                                        Down                    Up 
##           "glyburide"                 "418"                 "613" 
##                Steady                    No                  Down 
##                "6744"               "62198" "0.00597373272547983" 
##                    Up                Steady                    No 
## "0.00876052191559602"  "0.0963800322981722"   "0.888885713060752" 
##                                        Steady                     No 
##          "tolbutamide"                   "17"                "69956" 
##                 Steady                     No 
## "0.000242950852471668"    "0.999757049147528" 
##                                        Down                    Up 
##        "pioglitazone"                  "81"                 "178" 
##                Steady                    No                  Down 
##                "5004"               "64710" "0.00115758935589442" 
##                    Up                Steady                    No 
## "0.00254383833764452"  "0.0715132979863662"   "0.924785274320095" 
##                                        Down                    Up 
##       "rosiglitazone"                  "74"                 "132" 
##                Steady                    No                  Down 
##                "4455"               "65312" "0.00105755076958255" 
##                    Up                Steady                    No 
## "0.00188644191330942"  "0.0636674145741929"   "0.933388592742915" 
##                                          Down                     Up 
##             "acarbose"                    "0"                   "10" 
##                 Steady                     No                   Down 
##                  "190"                "69773"                    "0" 
##                     Up                 Steady                     No 
## "0.000142912266159804"  "0.00271533305703629"    "0.997141754676804" 
##                                          Down                     Up 
##             "miglitol"                    "1"                    "1" 
##                 Steady                     No                   Down 
##                   "18"                "69953" "1.42912266159804e-05" 
##                     Up                 Steady                     No 
## "1.42912266159804e-05" "0.000257242079087648"     "0.99971417546768" 
##                                        Steady                     No 
##         "troglitazone"                    "3"                "69970" 
##                 Steady                     No 
## "4.28736798479414e-05"    "0.999957126320152" 
##                                            Up                 Steady 
##           "tolazamide"                    "0"                   "30" 
##                     No                     Up                 Steady 
##                "69943"                    "0" "0.000428736798479413" 
##                     No 
##    "0.999571263201521" 
##                  No        No 
## "examide"   "69973"       "1" 
##                          No            No 
## "citoglipton"       "69973"           "1" 
##                                        Up                 Down 
##            "insulin"               "6777"               "7321" 
##               Steady                   No                   Up 
##              "21617"              "34258" "0.0968516427764995" 
##                 Down               Steady                   No 
##  "0.104626070055593"  "0.308933445757649"  "0.489588841410258" 
##                                          Down                     Up 
##  "glyburide.metformin"                    "4"                    "7" 
##                 Steady                     No                   Down 
##                  "485"                "69477" "5.71649064639218e-05" 
##                     Up                 Steady                     No 
## "0.000100038586311863"  "0.00693124490875052"    "0.992911551598474" 
##                                        Steady                     No 
##  "glipizide.metformin"                    "7"                "69966" 
##                 Steady                     No 
## "0.000100038586311863"    "0.999899961413688" 
##                                                Steady 
## "glimepiride.pioglitazone"                        "0" 
##                         No                     Steady 
##                    "69973"                        "0" 
##                         No 
##                        "1" 
##                                              Steady                        No 
## "metformin.rosiglitazone"                       "2"                   "69971" 
##                    Steady                        No 
##    "2.85824532319609e-05"       "0.999971417546768" 
##                                            Steady                       No 
## "metformin.pioglitazone"                      "1"                  "69972" 
##                   Steady                       No 
##   "1.42912266159804e-05"      "0.999985708773384"
# Any attribute with one category encompassing 95% of the records or greater will be removed for modeling due to low variance. This includes 16 of the 23 medications: Repaglinide, Nateglinide, Chlorpropamide, Acetohexamide, Tolbutamide, Acarbose, Miglitol, Troglitazone, Tolazamide, Examide, Citoglipton, Glyburide-metformin, Glipizide-metformin, Glimepiride-pioglitazone, Metformin-rosiglitazone, and Metformin-pioglitazone

# Add medication attributes to keep for exploratory analysis to db.expl
db.expl$metformin <- db.init$metformin
db.expl$glimepiride <- db.init$glimepiride
db.expl$glipizide <- db.init$glipizide
db.expl$glyburide <- db.init$glyburide
db.expl$pioglitazone <- db.init$pioglitazone
db.expl$rosiglitazone <- db.init$rosiglitazone
db.expl$insulin <- db.init$insulin

Change in Medication

table(db.init$change)
## 
##    Ch    No 
## 31491 38482
sort(prop.table(table(db.init$change))) # The two categories are pretty evenly split. 45.0% had a change in diabetic medication and 55.0% did not
## 
##       Ch       No 
## 0.450045 0.549955
db.expl$change <- db.init$change # Add to exploration data frame

Diabtes Medication

table(db.init$diabetesMed)
## 
##    No   Yes 
## 16680 53293
sort(prop.table(table(db.init$diabetesMed))) #76.2% of patients are taking a diabetic medication; 23.8% are not
## 
##        No       Yes 
## 0.2383777 0.7616223
db.expl$diabetesMed <- db.init$diabetesMed # Add to exploration data frame

Target Variable (Categorical)

table(db.init$readmitted) # There are currently 3 levels: readmitted within 30 days, readmitted after 30 days, and no readmission.
## 
##   <30   >30    NO 
##  6277 22222 41474
# '>30' and 'NO' will be grouped together, as the purpose of analysis is to compare patients readmitted within 30 days to those who were not
db.expl$readmitted <- fct_collapse(db.init$readmitted, 'Not<30' = c('>30', 'NO')) 
table(db.expl$readmitted)
## 
##    <30 Not<30 
##   6277  63696
sort(prop.table(table(db.expl$readmitted))) # The target attribute is imbalanced, with only 9.0% of the patients having been readmitted within 30 days, whereas 91.0% were not
## 
##        <30     Not<30 
## 0.08970603 0.91029397

Exploration Dataframe Created

Now that we have condensed categories for certain variables, and removed attributes that were irrelevant, had a large number of missing values, or had low variance, let’s review the final structure of the dataset to be used for exploratory analysis (db.expl).

str(db.expl) # We now have a dataframe with 28 variables and 69973 records ready for further exploratory analysis
## 'data.frame':    69973 obs. of  28 variables:
##  $ time_in_hospital        : int  13 12 1 9 3 7 7 10 4 1 ...
##  $ num_lab_procedures      : int  68 33 51 47 31 62 60 55 70 49 ...
##  $ num_procedures          : int  2 3 0 2 6 0 0 1 1 5 ...
##  $ num_medications         : int  28 18 8 17 16 11 15 31 21 2 ...
##  $ number_outpatient       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ number_inpatient        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_diagnoses        : int  8 8 5 9 9 7 8 8 7 8 ...
##  $ age                     : Factor w/ 10 levels "[0-10)","[10-20)",..: 9 10 5 5 6 7 5 9 7 7 ...
##  $ A1Cresult               : Factor w/ 3 levels ">7","None","Norm": 2 2 2 2 2 2 2 2 2 2 ...
##  $ race                    : Factor w/ 5 levels "Caucasian","AfricanAmerican",..: 1 1 1 2 1 2 1 1 1 2 ...
##  $ gender                  : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
##  $ admission_type_id       : Factor w/ 4 levels "Emergency","Urgent",..: 2 3 1 1 2 2 1 1 3 3 ...
##  $ discharge_disposition_id: Factor w/ 5 levels "DcHome","DcOtherFacility",..: 1 3 1 1 1 1 3 4 1 1 ...
##  $ admission_source_id     : Factor w/ 4 levels "PhysRef","TransferExtFacility",..: 2 2 3 3 2 2 3 3 2 2 ...
##  $ diag_1                  : Factor w/ 8 levels "390-459","Other",..: 1 1 2 3 1 2 1 1 1 4 ...
##  $ diag_2                  : Factor w/ 6 levels "390-459","Other",..: 1 2 2 1 1 2 4 1 1 2 ...
##  $ diag_3                  : Factor w/ 6 levels "390-459","Other",..: 2 5 4 2 4 2 4 1 2 6 ...
##  $ metformin               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 3 2 ...
##  $ glimepiride             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 3 2 ...
##  $ glipizide               : Factor w/ 4 levels "Down","No","Steady",..: 3 2 3 2 2 2 2 2 2 2 ...
##  $ glyburide               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 4 2 2 2 2 ...
##  $ pioglitazone            : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ rosiglitazone           : Factor w/ 4 levels "Down","No","Steady",..: 2 3 2 2 2 2 2 2 2 2 ...
##  $ insulin                 : Factor w/ 4 levels "Down","No","Steady",..: 3 3 3 3 3 3 1 3 3 3 ...
##  $ change                  : Factor w/ 2 levels "Ch","No": 1 1 1 2 2 1 1 2 1 2 ...
##  $ diabetesMed             : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ readmitted              : Factor w/ 2 levels "<30","Not<30": 2 2 2 2 2 1 1 2 2 2 ...
#write.csv(db.expl, '/Users/amyhowe/Desktop/Dataset_Exploration.csv', row.names = F) # Export dataset if desired

Bivariate Analysis

Correlation of Numeric and Ordinal Attributes

Let’s take a look at the correlations of numeric and ordinal attributes. Given that there will be one ordinal attribute (Age) to compare in the mix, as well as the fact that there are numerous outliers in the numeric data, we will be using Spearman’s correlation which uses ranks.

rcorr(as.matrix(cbind(db.expl[,c(1:8)], age = as.numeric(db.expl$age))), type = 'spearman') # No strong monotonic correlations. There are moderate correlations between number of medications and time in hospital (0.46), number of medications and number of procedures (0.37), and number of lab procedures and time in hospital (0.35). The rest are weak correlations. There is 0 correlation between time in hospital and number of emergency visits, but this is NOT statistically significant (p = 0.283). The same applies for number of procedures and number of lab procedures (p = 0.309). The rest of the findings are highly significant (p<0.01)
##                    time_in_hospital num_lab_procedures num_procedures
## time_in_hospital               1.00               0.35           0.18
## num_lab_procedures             0.35               1.00           0.00
## num_procedures                 0.18               0.00           1.00
## num_medications                0.46               0.24           0.37
## number_outpatient             -0.02              -0.02          -0.01
## number_emergency               0.00               0.02          -0.04
## number_inpatient               0.07               0.08          -0.01
## number_diagnoses               0.25               0.17           0.07
## age                            0.14               0.03          -0.05
##                    num_medications number_outpatient number_emergency
## time_in_hospital              0.46             -0.02             0.00
## num_lab_procedures            0.24             -0.02             0.02
## num_procedures                0.37             -0.01            -0.04
## num_medications               1.00              0.05             0.02
## number_outpatient             0.05              1.00             0.15
## number_emergency              0.02              0.15             1.00
## number_inpatient              0.06              0.10             0.14
## number_diagnoses              0.29              0.09             0.07
## age                           0.04              0.03            -0.03
##                    number_inpatient number_diagnoses   age
## time_in_hospital               0.07             0.25  0.14
## num_lab_procedures             0.08             0.17  0.03
## num_procedures                -0.01             0.07 -0.05
## num_medications                0.06             0.29  0.04
## number_outpatient              0.10             0.09  0.03
## number_emergency               0.14             0.07 -0.03
## number_inpatient               1.00             0.08  0.03
## number_diagnoses               0.08             1.00  0.21
## age                            0.03             0.21  1.00
## 
## n= 69973 
## 
## 
## P
##                    time_in_hospital num_lab_procedures num_procedures
## time_in_hospital                    0.0000             0.0000        
## num_lab_procedures 0.0000                              0.3090        
## num_procedures     0.0000           0.3090                           
## num_medications    0.0000           0.0000             0.0000        
## number_outpatient  0.0000           0.0000             0.0012        
## number_emergency   0.2831           0.0000             0.0000        
## number_inpatient   0.0000           0.0000             0.0160        
## number_diagnoses   0.0000           0.0000             0.0000        
## age                0.0000           0.0000             0.0000        
##                    num_medications number_outpatient number_emergency
## time_in_hospital   0.0000          0.0000            0.2831          
## num_lab_procedures 0.0000          0.0000            0.0000          
## num_procedures     0.0000          0.0012            0.0000          
## num_medications                    0.0000            0.0000          
## number_outpatient  0.0000                            0.0000          
## number_emergency   0.0000          0.0000                            
## number_inpatient   0.0000          0.0000            0.0000          
## number_diagnoses   0.0000          0.0000            0.0000          
## age                0.0000          0.0000            0.0000          
##                    number_inpatient number_diagnoses age   
## time_in_hospital   0.0000           0.0000           0.0000
## num_lab_procedures 0.0000           0.0000           0.0000
## num_procedures     0.0160           0.0000           0.0000
## num_medications    0.0000           0.0000           0.0000
## number_outpatient  0.0000           0.0000           0.0000
## number_emergency   0.0000           0.0000           0.0000
## number_inpatient                    0.0000           0.0000
## number_diagnoses   0.0000                            0.0000
## age                0.0000           0.0000
corr <- round(cor(cbind(db.expl[,c(1:8)], age = as.numeric(db.expl$age)), method = 'spearman'), 2)
ggcorrplot(corr, hc.order = TRUE, type = "lower", lab = TRUE) # Visualize the correlation heatmap

Scatterplots for Numeric Variables

Let’s take a look at the scatterplots for each combination of numeric variables.

pairs(db.expl[,1:8], pch = 20, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.1))

Let’s focus in on the scatterplots for the three combinations we found to have the highest Spearman’s r correlation coefficient.

plot(jitter(db.expl$time_in_hospital,2), db.expl$num_medications, pch=20, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.1), xlab = 'Time in Hospital', ylab = 'No. Medications') # Added transparency, as well as jitter along the x-axis to better see the trend. As the time in hospital increases, the number of medications appears to increase at a linear rate. However, there appears to be a lot of noise as well

plot(jitter(db.expl$num_procedures,2), db.expl$num_medications, pch=20, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.1), xlab = 'No. Procedures', ylab = 'No. Medications') # Jitter added along the x-axis as well. It appears as if as the number of procedures increase, the number of medications increase as well at a linear rate. However, there is more noise as the number of procedures increases as well

plot(jitter(db.expl$time_in_hospital,2), db.expl$num_lab_procedures, pch=20, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.1), xlab = 'Time in Hospital', ylab = 'No. Lab Procedures') # Jitter present for x-axis here too. As the time in hospital increases, the number of lab procedures increase. However, the relationship may be modeled better with a curve than with a line.

Variables by Readmission Status

We will compare each variable against the target variable (readmitted) to identify differences between the two groups: patients who were readmitted within 30 days, and patients who weren’t.

Numeric Variables by Readmission Status

For numeric variables, the count, mean, standard deviation, and boxplots will be compared between both groups. Additionally, a sample t-test will be completed to test for a statistically significant difference between the two groups.

Time in Hospital VS Readmitted

group_by(db.expl, readmitted) %>%
  summarise(count = n(), mean = mean(time_in_hospital), sd = sd(time_in_hospital)) # The mean for time in hospital for the <30 group is slightly higher than the not readmitted group, with a slightly higher standard deviation as well.
## # A tibble: 2 x 4
##   readmitted count  mean    sd
##   <fct>      <int> <dbl> <dbl>
## 1 <30         6277  4.80  3.06
## 2 Not<30     63696  4.22  2.92
boxplot(time_in_hospital~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Time in Hospital", ylab = "Readmission Status") # Visualize the two groups

# Test whether the mean of the <30 group is significantly higher than the Not<30 group; no need to test for normality as sample sizes are large (central limit theorem). First check to see if the variances are equal using an f-test to inform the t-test arguments
var.test(time_in_hospital ~ readmitted, data = db.expl) # Variances are not equal (p=2.644e-07)
## 
##  F test to compare two variances
## 
## data:  time_in_hospital by readmitted
## F = 1.0996, num df = 6276, denom df = 63695, p-value = 2.644e-07
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.060322 1.141018
## sample estimates:
## ratio of variances 
##           1.099623
t.test(db.expl$time_in_hospital[db.expl$readmitted == "<30"], db.expl$time_in_hospital[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE) # The finding that the mean for the <30 group is higher than the Not<30 group is highly significant (p<2.2e-16). Therefore, there is a longer average length of stay for patients who are readmitted within 30 days
## 
##  Welch Two Sample t-test
## 
## data:  db.expl$time_in_hospital[db.expl$readmitted == "<30"] and db.expl$time_in_hospital[db.expl$readmitted == "Not<30"]
## t = 14.286, df = 7445.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.5093295       Inf
## sample estimates:
## mean of x mean of y 
##  4.797196  4.221584

Number of Lab Procedures VS Readmitted

group_by(db.expl, readmitted) %>%
  summarise(count = n(), mean = mean(num_lab_procedures), sd = sd(num_lab_procedures)) # The mean for number of lab procedures for the <30 group is slightly higher than the not readmitted group, with a slightly lower standard deviation
## # A tibble: 2 x 4
##   readmitted count  mean    sd
##   <fct>      <int> <dbl> <dbl>
## 1 <30         6277  44.9  19.3
## 2 Not<30     63696  42.7  19.9
boxplot(num_lab_procedures~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Lab Procedures", ylab = "Readmission Status")

var.test(num_lab_procedures ~ readmitted, data = db.expl) # Variances of the two groups are not equal (p=0.001259)
## 
##  F test to compare two variances
## 
## data:  num_lab_procedures by readmitted
## F = 0.94087, num df = 6276, denom df = 63695, p-value = 0.001259
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.9072466 0.9762929
## sample estimates:
## ratio of variances 
##          0.9408736
t.test(db.expl$num_lab_procedures[db.expl$readmitted == "<30"], db.expl$num_lab_procedures[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE) # The finding that the mean for the <30 group is higher than the Not<30 group is highly significant (p<2.2e-16). Therefore, there is a higher average number of lab procedures completed for patients who are readmitted within 30 days
## 
##  Welch Two Sample t-test
## 
## data:  db.expl$num_lab_procedures[db.expl$readmitted == "<30"] and db.expl$num_lab_procedures[db.expl$readmitted == "Not<30"]
## t = 8.7323, df = 7651.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  1.818288      Inf
## sample estimates:
## mean of x mean of y 
##  44.91541  42.67507

Number of Procedures VS Readmitted

group_by(db.expl, readmitted) %>%
  summarise(count = n(), mean = mean(num_procedures), sd = sd(num_procedures)) # The means and standard deviations are very close for both groups
## # A tibble: 2 x 4
##   readmitted count  mean    sd
##   <fct>      <int> <dbl> <dbl>
## 1 <30         6277  1.42  1.73
## 2 Not<30     63696  1.43  1.76
boxplot(num_procedures~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Procedures", ylab = "Readmission Status") # Distributions look identical

var.test(num_procedures ~ readmitted, data = db.expl) # Fail to reject the null hypothesis that the variances are equal (p=0.07307)
## 
##  F test to compare two variances
## 
## data:  num_procedures by readmitted
## F = 0.96678, num df = 6276, denom df = 63695, p-value = 0.07307
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.9322232 1.0031703
## sample estimates:
## ratio of variances 
##          0.9667759
t.test(db.expl$num_procedures[db.expl$readmitted == "<30"], db.expl$num_procedures[db.expl$readmitted == "Not<30"], alternative = 'less' , var.equal = TRUE) # Fail to reject the null hypothesis that the means are the same (p=0.4847)
## 
##  Two Sample t-test
## 
## data:  db.expl$num_procedures[db.expl$readmitted == "<30"] and db.expl$num_procedures[db.expl$readmitted == "Not<30"]
## t = -0.038297, df = 69971, p-value = 0.4847
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##        -Inf 0.03734585
## sample estimates:
## mean of x mean of y 
##  1.424725  1.425615

Number of Medications VS Readmitted

group_by(db.expl, readmitted) %>%
  summarise(count = n(), mean = mean(num_medications), sd = sd(num_medications)) # The mean and standard deviation of the <30 group is slightly higher
## # A tibble: 2 x 4
##   readmitted count  mean    sd
##   <fct>      <int> <dbl> <dbl>
## 1 <30         6277  16.6  8.32
## 2 Not<30     63696  15.6  8.28
boxplot(num_medications~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Medications", ylab = "Readmission Status") # Distributions look similar

var.test(num_medications ~ readmitted, data = db.expl) # Fail to reject the null hypothesis that the variances are equal (p=0.5465)
## 
##  F test to compare two variances
## 
## data:  num_medications by readmitted
## F = 1.0112, num df = 6276, denom df = 63695, p-value = 0.5465
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.9750895 1.0492990
## sample estimates:
## ratio of variances 
##           1.011231
t.test(db.expl$num_medications[db.expl$readmitted == "<30"], db.expl$num_medications[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = TRUE)  # The results are highly significant (p<2.2e-16) that the mean of the <30 group is greater than the Not<30 group. Therefore, the average number of medications for those readmitted within 30 days is higher than those not readmitted within 30 days
## 
##  Two Sample t-test
## 
## data:  db.expl$num_medications[db.expl$readmitted == "<30"] and db.expl$num_medications[db.expl$readmitted == "Not<30"]
## t = 9.6309, df = 69971, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.8749603       Inf
## sample estimates:
## mean of x mean of y 
##  16.62578  15.57060

Number of Outpatient Visits VS Readmitted

group_by(db.expl, readmitted) %>%
  summarise(count = n(), mean = mean(number_outpatient), sd = sd(number_outpatient)) # The mean of the <30 group is slightly higher, while the sd is slightly lower
## # A tibble: 2 x 4
##   readmitted count  mean    sd
##   <fct>      <int> <dbl> <dbl>
## 1 <30         6277 0.308  1.04
## 2 Not<30     63696 0.277  1.07
boxplot(number_outpatient~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Outpatient Visits", ylab = "Readmission Status") # Distributions look similar, with fewer outliers for <30

var.test(number_outpatient ~ readmitted, data = db.expl) # Can reject the null hypothesis at the alpha=0.05 level that the variances are equal (p=0.03272)
## 
##  F test to compare two variances
## 
## data:  number_outpatient by readmitted
## F = 0.96053, num df = 6276, denom df = 63695, p-value = 0.03272
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.9261983 0.9966869
## sample estimates:
## ratio of variances 
##          0.9605277
t.test(db.expl$number_outpatient[db.expl$readmitted == "<30"], db.expl$number_outpatient[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE)  # The results are statistically significant (p=0.01096) that the mean of the <30 group is greater than the Not<30 group. Therefore, the average number of outpatient visits in the last year for those readmitted within 30 days is higher than those not readmitted within 30 days.
## 
##  Welch Two Sample t-test
## 
## data:  db.expl$number_outpatient[db.expl$readmitted == "<30"] and db.expl$number_outpatient[db.expl$readmitted == "Not<30"]
## t = 2.2924, df = 7621.9, p-value = 0.01096
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.008962153         Inf
## sample estimates:
## mean of x mean of y 
## 0.3084276 0.2766893

Number of Emergency Visits VS Readmitted

group_by(db.expl, readmitted) %>%
  summarise(count = n(), mean = mean(number_emergency), sd = sd(number_emergency)) # The mean and standard deviation of the <30 group are higher
## # A tibble: 2 x 4
##   readmitted count   mean    sd
##   <fct>      <int>  <dbl> <dbl>
## 1 <30         6277 0.150  0.582
## 2 Not<30     63696 0.0994 0.504
boxplot(number_emergency~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Emergency Visits", ylab = "Readmission Status") # Distributions look similar, with fewer outliers for <30

var.test(number_emergency ~ readmitted, data = db.expl) # Cannot consider the variances to be equal (p<2.2e-16)
## 
##  F test to compare two variances
## 
## data:  number_emergency by readmitted
## F = 1.3333, num df = 6276, denom df = 63695, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.285695 1.383543
## sample estimates:
## ratio of variances 
##           1.333349
t.test(db.expl$number_emergency[db.expl$readmitted == "<30"], db.expl$number_emergency[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE)  # The results are highly significant (p=2.016e-11) that the mean of the <30 group is greater than the Not<30 group. Therefore, the average number of emergency visits in the last year for those readmitted within 30 days is higher than those not readmitted within 30 days
## 
##  Welch Two Sample t-test
## 
## data:  db.expl$number_emergency[db.expl$readmitted == "<30"] and db.expl$number_emergency[db.expl$readmitted == "Not<30"]
## t = 6.6131, df = 7234.1, p-value = 2.016e-11
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.0378318       Inf
## sample estimates:
## mean of x mean of y 
## 0.1497531 0.0993940

Number of Inpatient Visits VS Readmitted

group_by(db.expl, readmitted) %>%
  summarise(count = n(), mean = mean(number_inpatient), sd = sd(number_inpatient)) # The mean of the <30 group is over double the mean of Not<30. The sd is higher as well
## # A tibble: 2 x 4
##   readmitted count  mean    sd
##   <fct>      <int> <dbl> <dbl>
## 1 <30         6277 0.369 0.983
## 2 Not<30     63696 0.157 0.546
boxplot(number_inpatient~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Inpatient Visits", ylab = "Readmission Status") # We're not able to see much here

var.test(number_inpatient ~ readmitted, data = db.expl) # Variances are not equal (p<2.2e-16)
## 
##  F test to compare two variances
## 
## data:  number_inpatient by readmitted
## F = 3.2355, num df = 6276, denom df = 63695, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  3.119891 3.357331
## sample estimates:
## ratio of variances 
##           3.235529
t.test(db.expl$number_inpatient[db.expl$readmitted == "<30"], db.expl$number_inpatient[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE)  # The findings are highly significant (p<2.2e-16) that the mean of the <30 group is greater than the Not<30 group. Therefore, the average number of inpatient visits in the last year for those readmitted within 30 days is higher than those not readmitted within 30 days
## 
##  Welch Two Sample t-test
## 
## data:  db.expl$number_inpatient[db.expl$readmitted == "<30"] and db.expl$number_inpatient[db.expl$readmitted == "Not<30"]
## t = 16.799, df = 6663.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.1908144       Inf
## sample estimates:
## mean of x mean of y 
## 0.3688068 0.1572783

Number of Diagnoses VS Readmitted

group_by(db.expl, readmitted) %>%
  summarise(count = n(), mean = mean(number_diagnoses), sd = sd(number_diagnoses)) # The mean of the <30 group is higher than the Not<30 group while the sd is lower
## # A tibble: 2 x 4
##   readmitted count  mean    sd
##   <fct>      <int> <dbl> <dbl>
## 1 <30         6277  7.51  1.85
## 2 Not<30     63696  7.20  2.01
boxplot(number_diagnoses~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Diagnoses", ylab = "Readmission Status") # The distributions appear identical here

var.test(number_diagnoses ~ readmitted, data = db.expl) # Variances are not equal (p<2.2e-16)
## 
##  F test to compare two variances
## 
## data:  number_diagnoses by readmitted
## F = 0.84113, num df = 6276, denom df = 63695, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.8110711 0.8727979
## sample estimates:
## ratio of variances 
##          0.8411333
t.test(db.expl$number_diagnoses[db.expl$readmitted == "<30"], db.expl$number_diagnoses[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE)  # Highly significant (p<2.2e-16) that the mean of the <30 group is greater than the Not<30 group. Therefore, the average number of diagnoses for those readmitted within 30 days is higher than those not readmitted within 30 days
## 
##  Welch Two Sample t-test
## 
## data:  db.expl$number_diagnoses[db.expl$readmitted == "<30"] and db.expl$number_diagnoses[db.expl$readmitted == "Not<30"]
## t = 12.903, df = 7822.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.277388      Inf
## sample estimates:
## mean of x mean of y 
##  7.513143  7.195224

Ordinal Variable by Readmission Status

For the one remaining ordinal variable, age, the frequencies and histograms of each group will be reviewed, and a wilcoxon rank sum test will be completed to identify if there is a statistically significant difference between the two groups.

Age VS Readmitted

table(db.expl$age, db.expl$readmitted) # Values appear to peak similarly in the 70-80 range
##           
##              <30 Not<30
##   [0-10)       3    150
##   [10-20)     26    508
##   [20-30)     83   1038
##   [30-40)    188   2504
##   [40-50)    506   6322
##   [50-60)    879  11470
##   [60-70)   1412  14272
##   [70-80)   1817  15933
##   [80-90)   1195   9907
##   [90-100)   168   1592
group_by(db.expl, readmitted) %>%
  summarise(count = n(), median = median(as.numeric(age))) # The median of the <30 group is 8 ([70-80)), while the median of the Not <30 group is 7 ([60-70)) 
## # A tibble: 2 x 3
##   readmitted count median
##   <fct>      <int>  <dbl>
## 1 <30         6277      8
## 2 Not<30     63696      7
ggplot(subset(db.expl, db.expl$readmitted == "<30"), aes(age)) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Age in Readmitted Group', y = 'Count')

ggplot(subset(db.expl, db.expl$readmitted == "Not<30"), aes(age)) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Age in Not Readmitted Group', y = 'Count') # The distribution of the readmitted in <30 days group appears to have a bit more weight to the right than the other group

age.numeric <- as.numeric(db.expl$age) # Convert Age to its ordered IDs (1-10) so that it may be used with the wilcoxon rank sum test function
wilcox.test(age.numeric~db.expl$readmitted, alternative = 'greater', paired = F) # Findings are highly significant (p<2.2e-16) that the center of the <30 group is greater than the center of the Not<30 group. Therefore, patients who have been readmitted within 30 days are generally older
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  age.numeric by db.expl$readmitted
## W = 219034815, p-value < 2.2e-16
## alternative hypothesis: true location shift is greater than 0

Categorical Variables by Readmission Status

First we will define a custom function to be used to calculate the percent contribution of each cell of a table to the chi-squared test statistic to identify combinations of categories with the highest contributions.

percent_contrib <- function(x) {
  y <- 100*x$residuals^2/x$statistic
  return(round(y, 3))
}

For each categorical variable, we will review the frequencies for each readmission group, complete a chi-squared test to identify if the groups have a statistically significant difference, and then, if the findings are statistically significant, review the percent contribution and residuals of cells to identify associations between categories of the variables.

Race VS Readmitted

table(db.expl$race, db.expl$readmitted)
##                  
##                     <30 Not<30
##   Caucasian        4942  49268
##   AfricanAmerican  1093  11532
##   Asian              41    447
##   Hispanic          122   1378
##   Other              79   1071
race_readm.chisq <- chisq.test(table(db.expl$race, db.expl$readmitted))
race_readm.chisq # The results are statistically significant at the alpha = 0.05 level (p=0.0311), indicating that there is a statistically significant association between the two variables (i.e., they are not independent)
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$race, db.expl$readmitted)
## X-squared = 10.625, df = 4, p-value = 0.03111
percent_contrib(race_readm.chisq) # The percent contribution (100*residual^2/chi-squared) indicates that the number of patients who identified their race as "Other" and were readmitted within 30 days contributed most to the total chi-square score (53.3%). Caucasian (12.1%), African American (13.0%), and Hispanic (11.0%) with <30 each contributed similarly.
##                  
##                      <30 Not<30
##   Caucasian       12.089  1.191
##   AfricanAmerican 12.991  1.280
##   Asian            1.657  0.163
##   Hispanic        11.032  1.087
##   Other           53.260  5.249
round(race_readm.chisq$residuals,2) # We can see that since the residual is negative for the Other, <30 cell, there were fewer people observed than expected, indicating a negative association. Otherwise, we can see from the residuals that there was a positive association for Caucasian and <30, and a negative association for Aftrican American and Hispanic with <30. I.e., more Caucasian patients were readmitted within 30 days than expected, and fewer African American, Hispanic, and "Other" patients were. This could make sense given that in the U.S., ethnic minorities are more likely to be of a lower socioeconomic status, and therefore are less likely to have health insurance and feel comfortable seeking care
##                  
##                     <30 Not<30
##   Caucasian        1.13  -0.36
##   AfricanAmerican -1.17   0.37
##   Asian           -0.42   0.13
##   Hispanic        -1.08   0.34
##   Other           -2.38   0.75

Gender VS Readmitted

table(db.expl$gender, db.expl$readmitted)
##         
##            <30 Not<30
##   Female  3361  33871
##   Male    2916  29825
chisq.test(table(db.expl$gender, db.expl$readmitted)) # The chi-squared test is not statistically significant (p=0.5856), indicating that we cannot reject the null hypothesis that the two variables are independent
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(db.expl$gender, db.expl$readmitted)
## X-squared = 0.29729, df = 1, p-value = 0.5856

Admission Type ID VS Readmitted

table(db.expl$admission_type_id, db.expl$readmitted)
##            
##               <30 Not<30
##   Emergency  3987  39372
##   Urgent     1148  11654
##   Elective   1141  12644
##   Other         1     26
admtype_readm.chisq <- chisq.test(table(db.expl$admission_type_id, db.expl$readmitted))
admtype_readm.chisq # The results are highly significant (p=0.008417), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$admission_type_id, db.expl$readmitted)
## X-squared = 11.717, df = 3, p-value = 0.008417
percent_contrib(admtype_readm.chisq) # The percent contribution indicates that the number of patients who were readmitted within 30 days for elective reasons contributed the most (63.1%), with those readmitted within 30 days through emergency contributing the next most (20.8%)
##            
##                <30 Not<30
##   Emergency 20.831  2.053
##   Urgent     0.001  0.000
##   Elective  63.072  6.215
##   Other      7.126  0.702
round(admtype_readm.chisq$residuals,2) # The residual is negative for <30, Elective, indicating that the number seen is less than expected. The residual is positive for <30, Emergency, indicating more people were admitted through emergency than expected. These findings suggest that if you are admitted to hospital for planned reasons, you are less likely to be readmitted within 30 days
##            
##               <30 Not<30
##   Emergency  1.56  -0.49
##   Urgent    -0.01   0.00
##   Elective  -2.72   0.85
##   Other     -0.91   0.29

Discharge Disposition ID VS Readmitted

table(db.expl$discharge_disposition_id, db.expl$readmitted)
##                        
##                           <30 Not<30
##   DcHome                 3377  44192
##   DcOtherFacility         868   3919
##   DcSNF                  1176   7608
##   DcHomeWithHomeService   796   7566
##   Other                    60    411
dcid_readm.chisq <- chisq.test(table(db.expl$discharge_disposition_id, db.expl$readmitted))
dcid_readm.chisq # The results are highly significant (p<2.2e-16), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$discharge_disposition_id, db.expl$readmitted)
## X-squared = 917.26, df = 4, p-value < 2.2e-16
percent_contrib(dcid_readm.chisq) # DcOtherFacility, <30 has the highest percent contribution (48.8%) with DcSNF (20.8%) and DcHome (20.3%) following
##                        
##                            <30 Not<30
##   DcHome                20.247  1.995
##   DcOtherFacility       48.833  4.812
##   DcSNF                 20.831  2.053
##   DcHomeWithHomeService  0.306  0.030
##   Other                  0.813  0.080
round(dcid_readm.chisq$residuals,2) # There is a negative association between DcHome and <30, and a positive association between each DcOtherFacility and DcSNF with <30. This indicates that in the <30 days readmitted group, there were fewer people than expected discharged home, with more people than expected discharged to a Skilled Nursing Facility or other facility
##                        
##                            <30 Not<30
##   DcHome                -13.63   4.28
##   DcOtherFacility        21.16  -6.64
##   DcSNF                  13.82  -4.34
##   DcHomeWithHomeService   1.68  -0.53
##   Other                   2.73  -0.86

Admission Source ID VS Readmitted

table(db.expl$admission_source_id, db.expl$readmitted)
##                      
##                         <30 Not<30
##   PhysRef              1871  19875
##   TransferExtFacility   508   5372
##   ER                   3897  38431
##   Other                   1     18
admsource_readm.chisq <- chisq.test(table(db.expl$admission_source_id, db.expl$readmitted))
admsource_readm.chisq # The results are not statistically significant at the alpha = 0.05 level (p=0.0555), although they are tending towards significance
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$admission_source_id, db.expl$readmitted)
## X-squared = 7.5795, df = 3, p-value = 0.05555

Diagnosis 1 VS Readmitted

table(db.expl$diag_1, db.expl$readmitted)
##              
##                 <30 Not<30
##   390-459      2067  19259
##   Other        1457  14410
##   250.0-250.9   524   5224
##   460-519       536   5910
##   520-579       502   5823
##   710-739       341   3723
##   780-799       344   5159
##   800-999       506   4188
d1_readm.chisq <- chisq.test(table(db.expl$diag_1, db.expl$readmitted))
d1_readm.chisq # The results are highly significant (p<2.2e-16), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$diag_1, db.expl$readmitted)
## X-squared = 96.623, df = 7, p-value < 2.2e-16
percent_contrib(d1_readm.chisq) # 780-799 (Symptoms, Signs, And Ill-Defined Conditions), <30 has the highest percent contribution (47.0%), with 800-999 (Injury And Poisoning), <30 (17.7%) and 390-459 (Diseases Of The Circulatory System), <30 (12.8%) following
##              
##                  <30 Not<30
##   390-459     12.818  1.263
##   Other        0.823  0.081
##   250.0-250.9  0.141  0.014
##   460-519      3.194  0.315
##   520-579      7.800  0.769
##   710-739      1.576  0.155
##   780-799     46.953  4.627
##   800-999     17.724  1.747
round(d1_readm.chisq$residuals,2) # There is a negative association between 780-799 and <30, and a positive association for each 800-999 and 390-459 with <30. Therefore, in the readmitted within 30 days group, there are fewer patients than expected with a primary diagnosis of Symptoms, Signs, And Ill-Defined Conditions, and more patients than expected with Injury and Poisoning, or a circulatory disorder
##              
##                 <30 Not<30
##   390-459      3.52  -1.10
##   Other        0.89  -0.28
##   250.0-250.9  0.37  -0.12
##   460-519     -1.76   0.55
##   520-579     -2.75   0.86
##   710-739     -1.23   0.39
##   780-799     -6.74   2.11
##   800-999      4.14  -1.30

Diagnosis 2 VS Readmitted

table(db.expl$diag_2, db.expl$readmitted)
##                  
##                     <30 Not<30
##   390-459          2020  20055
##   Other            1965  19141
##   240-279(not250)   456   5149
##   250.0-250.9       827   8873
##   460-519           555   5890
##   580-629           454   4588
d2_readm.chisq <- chisq.test(table(db.expl$diag_2, db.expl$readmitted))
d2_readm.chisq # The results are statistically significant at the alpha = 0.05 level (p=0.03454), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$diag_2, db.expl$readmitted)
## X-squared = 12.018, df = 5, p-value = 0.03454
percent_contrib(d2_readm.chisq) # 240-279(not250) (Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders) with <30 has the highest percent contribution (36.3%), with Other, <30 (22.6%) and 250.0-250.0 (Diabetes Mellitus), <30 (17.8%) following
##                  
##                      <30 Not<30
##   390-459          6.636  0.654
##   Other           22.571  2.224
##   240-279(not250) 36.250  3.572
##   250.0-250.9     17.803  1.754
##   460-519          7.717  0.760
##   580-629          0.053  0.005
round(d2_readm.chisq$residuals,2) # There are fewer patients than expected in those who are readmitted with a secondary diagnosis of Diabetes Mellitus or Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders, and more than expected with "Other"
##                  
##                     <30 Not<30
##   390-459          0.89  -0.28
##   Other            1.65  -0.52
##   240-279(not250) -2.09   0.66
##   250.0-250.9     -1.46   0.46
##   460-519         -0.96   0.30
##   580-629          0.08  -0.03

Diagnosis 3 VS Readmitted

table(db.expl$diag_3, db.expl$readmitted)
##                  
##                     <30 Not<30
##   390-459          1899  19949
##   Other            1979  19167
##   240-279(not250)   502   5901
##   250.0-250.9      1079  11467
##   460-519           444   3801
##   580-629           374   3411
d3_readm.chisq <- chisq.test(table(db.expl$diag_3, db.expl$readmitted))
d3_readm.chisq # The results are highly significant (p=3.032e-06), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$diag_3, db.expl$readmitted)
## X-squared = 33.472, df = 5, p-value = 3.032e-06
percent_contrib(d3_readm.chisq) # 460-519 (Diseases Of The Respiratory System) with <30 has the highest percent contribution (31.3%), with 240-279(not250) (Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders), <30 (27.3%) following 
##                  
##                      <30 Not<30
##   390-459          5.653  0.557
##   Other           10.610  1.046
##   240-279(not250) 27.254  2.686
##   250.0-250.9      5.728  0.564
##   460-519         31.334  3.088
##   580-629         10.450  1.030
round(d3_readm.chisq$residuals,2) # There are more patients than expected in those who are readmitted with a tertiary diagnosis of Diseases Of The Respiratory System, and fewer than expected with Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders
##                  
##                     <30 Not<30
##   390-459         -1.38   0.43
##   Other            1.88  -0.59
##   240-279(not250) -3.02   0.95
##   250.0-250.9     -1.38   0.43
##   460-519          3.24  -1.02
##   580-629          1.87  -0.59

A1C Result VS Readmitted

table(db.expl$A1Cresult, db.expl$readmitted)
##       
##          <30 Not<30
##   >7     755   8349
##   None  5199  51929
##   Norm   323   3418
A1C_readm.chisq <- chisq.test(table(db.expl$A1Cresult, db.expl$readmitted))
A1C_readm.chisq # The results are statistically significant at the alpha = 0.05 level (p=0.03305), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$A1Cresult, db.expl$readmitted)
## X-squared = 6.8195, df = 2, p-value = 0.03305
percent_contrib(A1C_readm.chisq) # >7 (abnormal HbA1c result) and <30 has the highest percent contribution (68.3%)
##       
##           <30 Not<30
##   >7   68.318  6.732
##   None 15.785  1.556
##   Norm  6.926  0.683
round(A1C_readm.chisq$residuals,2) # There is a negative association between >7 and <30, indicating that patients who were readmitted within 30 days were less likely than expected to have an abnormal HbA1c result. This is very interesting. It appears that having an abnormal result may be a protective factor to getting readmitted. This would likely be because as a result of the abnormal finding, some change was made to the patient's treatment regimen leading to better controlled diabetes and a reduction in complications. It is noteworthy however that 81.6% of cases had no HbA1c test done in hospital, so they wouldn't know if the result was normal or abnormal
##       
##          <30 Not<30
##   >7   -2.16   0.68
##   None  1.04  -0.33
##   Norm -0.69   0.22

Metformin VS Readmitted

table(db.expl$metformin, db.expl$readmitted)
##         
##            <30 Not<30
##   Down      44    391
##   No      5040  50030
##   Steady  1135  12499
##   Up        58    776
met_readm.chisq <- chisq.test(table(db.expl$metformin, db.expl$readmitted))
met_readm.chisq # The results are highly significant (p=0.002862), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$metformin, db.expl$readmitted)
## X-squared = 14.032, df = 3, p-value = 0.002862
percent_contrib(met_readm.chisq) # Steady and <30 has the highest percent contribution (45.2%) with Up and <30 (26.9%) following
##         
##             <30 Not<30
##   Down    4.525  0.446
##   No     14.394  1.418
##   Steady 45.177  4.452
##   Up     26.933  2.654
round(met_readm.chisq$residuals,2) # There are negative associations between Steady and <30 and Up and <30. This indicates that in the group of patients who are readmitted within 30 days, there are fewer cases than expected where they were on metformin with no change in dosage, and where their metformin dosages were increased.
##         
##            <30 Not<30
##   Down    0.80  -0.25
##   No      1.42  -0.45
##   Steady -2.52   0.79
##   Up     -1.94   0.61

Glimepiride VS Readmitted

table(db.expl$glimepiride, db.expl$readmitted)
##         
##            <30 Not<30
##   Down      18    118
##   No      5953  60323
##   Steady   283   3048
##   Up        23    207
glim_readm.chisq <- chisq.test(table(db.expl$glimepiride, db.expl$readmitted))
glim_readm.chisq # The findings are not statistically significant (p=0.235), indicating that we cannot reject the null hypothesis that the two variables are independent
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$glimepiride, db.expl$readmitted)
## X-squared = 4.2574, df = 3, p-value = 0.235

Glipizide VS Readmitted

table(db.expl$glipizide, db.expl$readmitted)
##         
##            <30 Not<30
##   Down      48    323
##   No      5400  55566
##   Steady   766   7297
##   Up        63    510
glip_readm.chisq <- chisq.test(table(db.expl$glipizide, db.expl$readmitted))
glip_readm.chisq # The results are highly significant (p=0.003262), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$glipizide, db.expl$readmitted)
## X-squared = 13.752, df = 3, p-value = 0.003262
percent_contrib(glip_readm.chisq) # Down and <30 has the highest percent contribution (47.3%), with Up, <30 (19.0%) and Steady, <30 (18.3%) following
##         
##             <30 Not<30
##   Down   47.336  4.665
##   No      6.333  0.624
##   Steady 18.330  1.806
##   Up     19.030  1.875
round(glip_readm.chisq$residuals,2) # There are positive associations between each Down, Steady and Up with <30. This indicates there are more people than expected in the readmitted group who are on glipizide with or without a dosage change. It is noteworthy that glipizide, glimepiride and glyburide are all part of the sulfonylurea class of medications, which are associated with increased risk of cardiovascular death as well as episodes of severe hypoglycemia
##         
##            <30 Not<30
##   Down    2.55  -0.80
##   No     -0.93   0.29
##   Steady  1.59  -0.50
##   Up      1.62  -0.51

Glyburide VS Readmitted

table(db.expl$glyburide, db.expl$readmitted)
##         
##            <30 Not<30
##   Down      40    378
##   No      5548  56650
##   Steady   630   6114
##   Up        59    554
gly_readm.chisq <- chisq.test(table(db.expl$glyburide, db.expl$readmitted))
gly_readm.chisq # The results are not statistically significant (p=0.6068), indicating we fail to reject the null hypothesis that the variables are independent
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$glyburide, db.expl$readmitted)
## X-squared = 1.8376, df = 3, p-value = 0.6068

Pioglitazone VS Readmitted

table(db.expl$pioglitazone, db.expl$readmitted)
##         
##            <30 Not<30
##   Down       9     72
##   No      5818  58892
##   Steady   429   4575
##   Up        21    157
pio_readm.chisq <- chisq.test(table(db.expl$pioglitazone, db.expl$readmitted))
pio_readm.chisq # The results are not statistically significant (p=0.3622), indicating we fail to reject the null hypothesis that the variables are independent
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$pioglitazone, db.expl$readmitted)
## X-squared = 3.1974, df = 3, p-value = 0.3622

Rosiglitazone VS Readmitted

table(db.expl$rosiglitazone, db.expl$readmitted)
##         
##            <30 Not<30
##   Down       4     70
##   No      5867  59445
##   Steady   393   4062
##   Up        13    119
rosi_readm.chisq <- chisq.test(table(db.expl$rosiglitazone, db.expl$readmitted))
rosi_readm.chisq # The results are not statistically significant (p=0.7032), indicating we fail to reject the null hypothesis that the variables are independent. Of note, pioglitazone and rosiglitazone come from the same class of medications (glitazones)
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$rosiglitazone, db.expl$readmitted)
## X-squared = 1.41, df = 3, p-value = 0.7032

Insulin VS Readmitted

table(db.expl$insulin, db.expl$readmitted)
##         
##            <30 Not<30
##   Down     772   6549
##   No      2837  31421
##   Steady  2001  19616
##   Up       667   6110
insulin_readm.chisq <- chisq.test(table(db.expl$insulin, db.expl$readmitted))
insulin_readm.chisq # The results are highly significant (p=5.876e-11), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$insulin, db.expl$readmitted)
## X-squared = 50.626, df = 3, p-value = 5.876e-11
percent_contrib(insulin_readm.chisq) # Down, <30 has the highest percent contribution (40.0%), with No, <30 (35.8%) following
##         
##             <30 Not<30
##   Down   39.958  3.938
##   No     35.844  3.532
##   Steady  3.893  0.384
##   Up     11.334  1.117
round(insulin_readm.chisq$residuals,2) # There is a positive association between Down and <30, and a negative association between No and <30. This indicates that patients who were readmitted had more cases than expected where they were taking insulin and the dosage was decreased, and fewer patients who were not on insulin than expected. Insulin is a medication requiring close daily monitoring of blood sugar levels. A reduction in dosage without proper follow-up and monitoring could lead to hyperglycemia and eventually hospitalization for complications (e.g., diabetic ketoacidosis)
##         
##            <30 Not<30
##   Down    4.50  -1.41
##   No     -4.26   1.34
##   Steady  1.40  -0.44
##   Up      2.40  -0.75

Change VS Readmitted

table(db.expl$change, db.expl$readmitted)
##     
##        <30 Not<30
##   Ch  2971  28520
##   No  3306  35176
ch_readm.chisq <- chisq.test(table(db.expl$change, db.expl$readmitted))
ch_readm.chisq # The results are highly significant (p=0.0001085), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(db.expl$change, db.expl$readmitted)
## X-squared = 14.983, df = 1, p-value = 0.0001085
percent_contrib(ch_readm.chisq) # Ch, <30 has the highest percent contribution (50.4%), with No, <30 (41.3%) following
##     
##         <30 Not<30
##   Ch 50.407  4.967
##   No 41.249  4.065
round(ch_readm.chisq$residuals,2) # There is a positive association between Ch and <30, and a negative association between No and <30. This indicates that in the <30 day readmitted group, there were significantly more people than expected who had a change in diabetic medications in hospital, and fewer than expected who did not have a change. One reason for this could be that there may not be adequate follow-up relating to changes in diabetic medications, leading to complications and re-hospitalization
##     
##        <30 Not<30
##   Ch  2.75  -0.86
##   No -2.49   0.78

Diabetes Meds VS Readmitted

table(db.expl$diabetesMed, db.expl$readmitted)
##      
##         <30 Not<30
##   No   1259  15421
##   Yes  5018  48275
dbm_readm.chisq <- chisq.test(table(db.expl$diabetesMed, db.expl$readmitted))
dbm_readm.chisq # The results are highly significant (p=1.953e-13), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(db.expl$diabetesMed, db.expl$readmitted)
## X-squared = 54.052, df = 1, p-value = 1.953e-13
percent_contrib(dbm_readm.chisq) # No, <30 has the highest percent contribution (69.6%), with Yes, <30 (22.0%) following
##      
##          <30 Not<30
##   No  69.623  6.861
##   Yes 21.791  2.147
round(dbm_readm.chisq$residuals,2) # There is a negative association between No diabetes meds and <30, with a positive association between Yes diabetes meds and <30. This indicates that of the people who were readmitted, fewer than expected are not on diabetic medications, and more than expected are on diabetic medications. These results are expected, as a lack of diabetic medications for diabetes indicates the individual is earlier in the disease progression and therefore their condition is less severe. Diabetic medications, if not monitored appropriately, can lead to complications such as hypo- and hyperglycemia, sometimes requiring hospitalization
##      
##         <30 Not<30
##   No  -6.13   1.93
##   Yes  3.43  -1.08

Other Bivariate Analyses

Due to the above finding that patients in the readmitted group were less likely to have an abnormal HbA1c result, I’d like to explore whether there is a connection between an abnormal HbA1c Result and there being a potentially resulting change in diabetic medication during stay.

A1c Result VS Change in Medication

table(db.expl$A1Cresult, db.expl$change)
##       
##           Ch    No
##   >7    5473  3631
##   None 24410 32718
##   Norm  1608  2133
A1C_change.chisq <- chisq.test(table(db.expl$A1Cresult, db.expl$change))
A1C_change.chisq # The results are highly significant (p<2.2e-16), indicating that there is an association between the two variables
## 
##  Pearson's Chi-squared test
## 
## data:  table(db.expl$A1Cresult, db.expl$change)
## X-squared = 965.75, df = 2, p-value < 2.2e-16
percent_contrib(A1C_change.chisq) # >7 (abnormal HbA1c Result) and Change in medication has the highest percent contribution (47.8%), with >7 and No change in medication (39.2%) following
##       
##            Ch     No
##   >7   47.836 39.145
##   None  6.808  5.571
##   Norm  0.352  0.288
round(A1C_change.chisq$residuals,2) # There are more patients than expected in the group of those with an abnormal HbA1c and a Change in medication, and fewer patients than expected in the group of those with an abnormal HbA1c and No change. This supports the idea that an abnormal HbA1c finding may contribute to a change in diabetic medication. However, more focused study would need to be completed to identify if this relationship may be causal
##       
##            Ch     No
##   >7    21.49 -19.44
##   None  -8.11   7.34
##   Norm  -1.84   1.67

Multivariate Analysis

Given that we found there was a positive association between a reduction in insulin dosage and being readmitted within 30 days, I would like compare this to discharge disposition ID as well in the subset of patients who received a reduction in insulin dosage.

Reduction in Insulin VS Readmitted VS Discharge

insDown <- db.expl[db.expl$insulin == 'Down',] # Create subset
table(insDown$discharge_disposition_id, insDown$readmitted)
##                        
##                          <30 Not<30
##   DcHome                 392   4347
##   DcOtherFacility        122    378
##   DcSNF                  149    896
##   DcHomeWithHomeService  101    883
##   Other                    8     45
ggplot(insDown, aes(x=fct_infreq(discharge_disposition_id), fill = readmitted)) + geom_bar() + theme_bw() + labs(x = 'Discharge Disposition ID', y = 'Count', title = 'Discharge and Readmission Status for Patients with Insulin Reduction') + theme(axis.text.x = element_text(angle = 90, hjust = 1))

notinsDown <- db.expl[db.expl$insulin != 'Down',]
ggplot(notinsDown, aes(x=fct_infreq(discharge_disposition_id), fill = readmitted)) + geom_bar() + theme_bw() + labs(x = 'Discharge Disposition ID', y = 'Count', title = 'Discharge and Readmission Status for Patients Without Insulin Reduction') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Subset with insulin reduction appears to have slightly higher proportions of not readmitted in every category except for DcHome

insDown_dcid_readm.chisq <- chisq.test(table(insDown$discharge_disposition_id, insDown$readmitted))
insDown_dcid_readm.chisq # The chi-squared results are highly significant (p<2.2e-16) that there is an association between readmitted and discharge_disposition_id within the subset of patients who had a decrease in insulin dosage
## 
##  Pearson's Chi-squared test
## 
## data:  table(insDown$discharge_disposition_id, insDown$readmitted)
## X-squared = 144.23, df = 4, p-value < 2.2e-16
percent_contrib(insDown_dcid_readm.chisq) # DcOtherFacility, <30 has the highest percent contribution (63.1%) with DcHome (16.1%) following
##                        
##                            <30 Not<30
##   DcHome                16.101  1.898
##   DcOtherFacility       63.107  7.439
##   DcSNF                  9.474  1.117
##   DcHomeWithHomeService  0.051  0.006
##   Other                  0.721  0.085
round(insDown_dcid_readm.chisq$residuals,2) # There is a positive association between DcOtherFacility and <30, and a negative association between DcHome and <30. This indicates that in the <30 group, more patients than expected were DcOtherFacility, and fewer than expected were discharged home. These findings are similar to the bivariate comparison of Discharge Disposition ID and Readmitted in the whole dataset, although in this subset, the DcOtherFacility, <30 group has a higher percent contribution (63.1% vs 48.8%), suggesting that with an insulin dosage reduction, patients may be even more likely to be discharged to another facility in the readmitted group
##                        
##                           <30 Not<30
##   DcHome                -4.82   1.65
##   DcOtherFacility        9.54  -3.28
##   DcSNF                  3.70  -1.27
##   DcHomeWithHomeService -0.27   0.09
##   Other                  1.02  -0.35

On another note, given that there were positive associations between medication change and readmission, and medication change and an abnormal A1C result, but a negative association between abnormal A1C and readmission, I would like to further explore whether within the group of people with a change in medication there is still a negative association between abnormal A1C and readmission.

Readmitted <30 VS A1C Result VS Change in Medication

ch <- db.expl[db.expl$change == 'Ch',] # Create subset
table(ch$A1Cresult, ch$readmitted)
##       
##          <30 Not<30
##   >7     473   5000
##   None  2359  22051
##   Norm   139   1469
ggplot(ch, aes(x=fct_infreq(A1Cresult), fill = readmitted)) + geom_bar() + theme_bw() + labs(x = 'A1C Result', y = 'Count', title = 'A1C and Readmission Status for Patients Who Had a Med Change') + theme(axis.text.x = element_text(angle = 90, hjust = 1))

noch <- db.expl[db.expl$change != 'Ch',] 
ggplot(noch, aes(fct_infreq(A1Cresult), fill = readmitted)) + geom_bar() + theme_bw() + labs(x = 'A1C Result', y = 'Count', title = 'A1C and Readmission Status for Patients with No Med Change') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # There appears to be a higher proportion of patients with an abnormal A1C (>7) who were not readmitted in the change in medication group

A1C_readm_ch.chisq <- chisq.test(table(ch$A1Cresult, ch$readmitted))
A1C_readm_ch.chisq # The chi-squared results are significant at the alpha=0.05 level (p=0.03509), indicating that there is an association between A1C result and readmitted within the change in medication group
## 
##  Pearson's Chi-squared test
## 
## data:  table(ch$A1Cresult, ch$readmitted)
## X-squared = 6.6995, df = 2, p-value = 0.03509
percent_contrib(A1C_readm_ch.chisq) # >7, <30 has the highest pecent contribution (54.3%)
##       
##           <30 Not<30
##   >7   54.317  5.658
##   None 20.364  2.121
##   Norm 15.884  1.655
round(A1C_readm_ch.chisq$residuals,2) # There is still a negative association between >7 and <30, indicating that even within the group of patients who had a change in medication, there were fewer patients than expected who had an abnormal A1C result and were readmitted. However, the percent contribution is lower than that of >7, <30 for the greater sample (68.3%)
##       
##          <30 Not<30
##   >7   -1.91   0.62
##   None  1.17  -0.38
##   Norm -1.03   0.33

Transform Data for Modeling

In order to use the data for modeling, we will:

These transformations will be put in a new dataframe object simply entitled “db”.

First we will define a function to complete the min-max scaling on numeric attributes.

minmax <- function(x) {
  return((x-min(x))/(max(x)-min(x)))
} # All resulting values will be between 0 and 1

Numeric Attributes

db <- db.expl # Create new dataframe entitled "db" to house data transformed for modeling
db[,1:8] <- lapply(db.expl[,1:8], minmax) # Apply min-max scaling to all numeric attributes and save them to the new dataframe

Ordinal Attribute

db$age <- minmax(as.numeric(db.expl$age)) # Convert Age to its ordered numeric IDs (1-10), and then scale the values

Categorical Attributes

# First convert categorical variables with two categories (gender, change, diabetesMed, readmitted) to 0 and 1 so that they don't get converted into two variables each with the one-hot encoding
db$gender <- as.numeric(db.expl$gender)
db$gender[db$gender == 1] <- 0 # 0 will be female
db$gender[db$gender == 2] <- 1 # 1 will be male
db$change <- as.numeric(db.expl$change)
db$change[db$change == 2] <- 0 # 1 will be change ('Ch'), 0 will be no change ('No')
db$diabetesMed <- as.numeric(db.expl$diabetesMed)
db$diabetesMed[db$diabetesMed == 1] <- 0 # 0 will be No
db$diabetesMed[db$diabetesMed == 2] <- 1 # 1 will be Yes
db$readmitted <- as.numeric(db.expl$readmitted)
db$readmitted[db$readmitted == 2] <- 0 # Not<30 will be 0, while <30 will be 1

# Now one-hot encode the remaining categorical variables (factor type)
db <- one_hot(as.data.table(db)) # Replaced 19 categorical variables with 77 binary dummy variables

Now we may review the final structure of the dataset to be modeled, and export it.

str(db) # There are now a total of 82 variables, all between 0 and 1
## Classes 'data.table' and 'data.frame':   69973 obs. of  82 variables:
##  $ time_in_hospital                              : num  0.923 0.846 0 0.615 0.154 ...
##  $ num_lab_procedures                            : num  0.511 0.244 0.382 0.351 0.229 ...
##  $ num_procedures                                : num  0.333 0.5 0 0.333 1 ...
##  $ num_medications                               : num  0.3375 0.2125 0.0875 0.2 0.1875 ...
##  $ number_outpatient                             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_emergency                              : num  0 0 0 0 0 ...
##  $ number_inpatient                              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_diagnoses                              : num  0.467 0.467 0.267 0.533 0.533 ...
##  $ age                                           : num  0.889 1 0.444 0.444 0.556 ...
##  $ A1Cresult_>7                                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ A1Cresult_None                                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ A1Cresult_Norm                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ race_Caucasian                                : int  1 1 1 0 1 0 1 1 1 0 ...
##  $ race_AfricanAmerican                          : int  0 0 0 1 0 1 0 0 0 1 ...
##  $ race_Asian                                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ race_Hispanic                                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ race_Other                                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ gender                                        : num  0 0 1 0 1 1 0 1 1 0 ...
##  $ admission_type_id_Emergency                   : int  0 0 1 1 0 0 1 1 0 0 ...
##  $ admission_type_id_Urgent                      : int  1 0 0 0 1 1 0 0 0 0 ...
##  $ admission_type_id_Elective                    : int  0 1 0 0 0 0 0 0 1 1 ...
##  $ admission_type_id_Other                       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id_DcHome               : int  1 0 1 1 1 1 0 0 1 1 ...
##  $ discharge_disposition_id_DcOtherFacility      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id_DcSNF                : int  0 1 0 0 0 0 1 0 0 0 ...
##  $ discharge_disposition_id_DcHomeWithHomeService: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ discharge_disposition_id_Other                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_source_id_PhysRef                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_source_id_TransferExtFacility       : int  1 1 0 0 1 1 0 0 1 1 ...
##  $ admission_source_id_ER                        : int  0 0 1 1 0 0 1 1 0 0 ...
##  $ admission_source_id_Other                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_1_390-459                                : int  1 1 0 0 1 0 1 1 1 0 ...
##  $ diag_1_Other                                  : int  0 0 1 0 0 1 0 0 0 0 ...
##  $ diag_1_250.0-250.9                            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ diag_1_460-519                                : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ diag_1_520-579                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_1_710-739                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_1_780-799                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_1_800-999                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_2_390-459                                : int  1 0 0 1 1 0 0 1 1 0 ...
##  $ diag_2_Other                                  : int  0 1 1 0 0 1 0 0 0 1 ...
##  $ diag_2_240-279(not250)                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_2_250.0-250.9                            : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ diag_2_460-519                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_2_580-629                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_3_390-459                                : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ diag_3_Other                                  : int  1 0 0 1 0 1 0 0 1 0 ...
##  $ diag_3_240-279(not250)                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_3_250.0-250.9                            : int  0 0 1 0 1 0 1 0 0 0 ...
##  $ diag_3_460-519                                : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ diag_3_580-629                                : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ metformin_Down                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ metformin_No                                  : int  1 1 1 1 1 1 0 1 0 1 ...
##  $ metformin_Steady                              : int  0 0 0 0 0 0 1 0 1 0 ...
##  $ metformin_Up                                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ glimepiride_Down                              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ glimepiride_No                                : int  1 1 1 1 1 1 1 1 0 1 ...
##  $ glimepiride_Steady                            : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ glimepiride_Up                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ glipizide_Down                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ glipizide_No                                  : int  0 1 0 1 1 1 1 1 1 1 ...
##  $ glipizide_Steady                              : int  1 0 1 0 0 0 0 0 0 0 ...
##  $ glipizide_Up                                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ glyburide_Down                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ glyburide_No                                  : int  1 1 1 1 1 0 1 1 1 1 ...
##  $ glyburide_Steady                              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ glyburide_Up                                  : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ pioglitazone_Down                             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pioglitazone_No                               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pioglitazone_Steady                           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pioglitazone_Up                               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rosiglitazone_Down                            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rosiglitazone_No                              : int  1 0 1 1 1 1 1 1 1 1 ...
##  $ rosiglitazone_Steady                          : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ rosiglitazone_Up                              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ insulin_Down                                  : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ insulin_No                                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ insulin_Steady                                : int  1 1 1 1 1 1 0 1 1 1 ...
##  $ insulin_Up                                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ change                                        : num  1 1 1 0 0 1 1 0 1 0 ...
##  $ diabetesMed                                   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ readmitted                                    : num  0 0 0 0 0 1 1 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
#write.csv(db, '/Users/amyhowe/Desktop/Dataset_Modeling.csv', row.names = F) # Export dataset for modeling